From 9782bec7cfdb3d20ab631d2d18fcf8732f184f02 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Apr 2021 11:38:04 +0200 Subject: [PATCH 001/192] bug fix runSingle.py --- fdog/fDOGassembly.py | 4 +++- fdog/runSingle.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b802b26..f207516 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -597,6 +597,9 @@ def main(): if core_path == '': core_path = out + '/core_orthologs/' + print(assemblyDir) + + # user input has to be checked here before fDOGassembly continues @@ -725,7 +728,6 @@ def main(): return 1 ################## checking accepted genes for co-orthologs ########################## - print(reciprocal_sequences) reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index 34d7fc1..a0ded09 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -437,8 +437,8 @@ def main(): assemblypath = cfg['assemblypath'] except: sys.exit('assemblypath not found in %s' % pathFile) - if assembly == True: - searchpath = assemblypath + if assembly == True: + searchpath = assemblypath ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) From e56d87ac8f9b7dc5240ea9e6e090ca303648fdd1 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 10:04:04 +0200 Subject: [PATCH 002/192] cleaning output --- fdog/fDOGassembly.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f207516..515ddfe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -60,7 +60,7 @@ def parse_blast(line, blast_results): #print(line) line = line.replace("\n", "") line_info = line.split("\t") - #print(line_info) + print(line_info) evalue = float(line_info[3]) #cut off @@ -598,7 +598,7 @@ def main(): core_path = out + '/core_orthologs/' print(assemblyDir) - + # user input has to be checked here before fDOGassembly continues @@ -620,7 +620,7 @@ def main(): ###################### create tmp folder ################################### - os.system('mkdir ' + out + '/tmp') + os.system('mkdir ' + out + '/tmp' + '>/dev/null 2>&1') ######################## consensus sequence ################################ @@ -659,7 +659,7 @@ def main(): searchBool = True ################### path definitions ################################### - os.system('mkdir ' + out + '/tmp/' + asName) + os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1') tmp_path = out + "/tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" if searchTaxon != '': @@ -740,7 +740,7 @@ def main(): if searchTaxon != '' and fasoff == False: fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('mkdir ' + tmp_path + 'anno_dir') + os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName ) From 766c89d34b02723403bd4a03296f30785d6c4feb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 10:56:26 +0200 Subject: [PATCH 003/192] testing --- fdog/fDOGassembly.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 515ddfe..d06e2bc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -31,6 +31,11 @@ def merge(blast_results, insert_length): i = 1 while i < size_list-1: + a = locations[j][0] + b = locations[i][0] + c = locations[j][1] + d = locations[j][5] + e = locations[i][5] if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -60,7 +65,7 @@ def parse_blast(line, blast_results): #print(line) line = line.replace("\n", "") line_info = line.split("\t") - print(line_info) + #print(line_info) evalue = float(line_info[3]) #cut off @@ -597,7 +602,7 @@ def main(): if core_path == '': core_path = out + '/core_orthologs/' - print(assemblyDir) + #print(assemblyDir) From 48e41540d6ba403d974219a54c1563436ac54661 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:00:31 +0200 Subject: [PATCH 004/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d06e2bc..c317d8a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,7 +27,7 @@ def merge(blast_results, insert_length): j = 0 - while j < size_list-1: + while j < size_list-2: i = 1 while i < size_list-1: From 47f45d61f2875f61822e12f310e4b07d5eec20df Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:19:14 +0200 Subject: [PATCH 005/192] testing --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c317d8a..be7edaf 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,6 +599,9 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' if out == '': out = os.getcwd() + else: + if not os.path.exists(out + '/group'): + os.system('mkdir ' + out + '/group') if core_path == '': core_path = out + '/core_orthologs/' From fe44e0bf0458909febf5e5c9bec2fecd85c5f7ee Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:21:15 +0200 Subject: [PATCH 006/192] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index be7edaf..98e6480 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,6 +599,7 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' if out == '': out = os.getcwd() + os.system('mkdir ' + out + '/group') else: if not os.path.exists(out + '/group'): os.system('mkdir ' + out + '/group') From 34e87cac0ca8f4b4c24ef807223c2b7cecaa0dbc Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:28:57 +0200 Subject: [PATCH 007/192] testing --- fdog/fDOGassembly.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 98e6480..e4434bc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,10 +599,12 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' if out == '': out = os.getcwd() - os.system('mkdir ' + out + '/group') + os.system('mkdir ' + out + '/' + group) + out = out + '/' + group else: - if not os.path.exists(out + '/group'): - os.system('mkdir ' + out + '/group') + if not os.path.exists(out + '/' + group): + os.system('mkdir ' + out + '/' + group) + out = out + '/' + group if core_path == '': core_path = out + '/core_orthologs/' From 32bce0eb9c9d9e1193ea2a668240fbab0f5be18d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:32:12 +0200 Subject: [PATCH 008/192] testing --- fdog/fDOGassembly.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e4434bc..ae77ac3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -601,10 +601,6 @@ def main(): out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group - else: - if not os.path.exists(out + '/' + group): - os.system('mkdir ' + out + '/' + group) - out = out + '/' + group if core_path == '': core_path = out + '/core_orthologs/' From a8362e35e18c6f298227cd658b4a33ed5d6b3e8a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:38:23 +0200 Subject: [PATCH 009/192] testing --- fdog/fDOGassembly.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ae77ac3..d476d7a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -566,18 +566,8 @@ def main(): # print(out + "fdog.log \n") # sys.stdout = Logger(out) - try: - f = open(out + "fdog.log", "a+") - except FileNotFoundError: - f = open(out + "fdog.log", "w") - if silent == True: - sys.stderr = f - sys.stdout = f - else: - sys.stdout = Logger(f) - #checking paths if dataPath == '': @@ -605,6 +595,17 @@ def main(): core_path = out + '/core_orthologs/' #print(assemblyDir) + try: + f = open(out + "fdog.log", "a+") + except FileNotFoundError: + f = open(out + "fdog.log", "w") + + + if silent == True: + sys.stderr = f + sys.stdout = f + else: + sys.stdout = Logger(f) From 0458c252acb2c4077c551dcb3ddf361494617251 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:42:57 +0200 Subject: [PATCH 010/192] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d476d7a..f54a654 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -594,6 +594,7 @@ def main(): if core_path == '': core_path = out + '/core_orthologs/' + print(out) #print(assemblyDir) try: f = open(out + "fdog.log", "a+") From 1b07c9017814ece00a64adad8a97aac00e1ec89a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:54:09 +0200 Subject: [PATCH 011/192] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f54a654..d0b8610 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -591,10 +591,12 @@ def main(): out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group + if core_path == '': core_path = out + '/core_orthologs/' print(out) + print("test " + group + "\n" ) #print(assemblyDir) try: f = open(out + "fdog.log", "a+") From 2d3f8dda146d10082186dc1dce395c87f0949505 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:56:36 +0200 Subject: [PATCH 012/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d0b8610..a00876c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,9 +599,9 @@ def main(): print("test " + group + "\n" ) #print(assemblyDir) try: - f = open(out + "fdog.log", "a+") + f = open(out + "/fdog.log", "a+") except FileNotFoundError: - f = open(out + "fdog.log", "w") + f = open(out + "/fdog.log", "w") if silent == True: From afec218d459c6cc181dd80a7eebe7e41a74754a9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:59:34 +0200 Subject: [PATCH 013/192] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a00876c..33def84 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -588,6 +588,7 @@ def main(): if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' if out == '': + print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group From 9983e001ab8704188bf74168faf56b0e74a76def Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 12:05:45 +0200 Subject: [PATCH 014/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 33def84..87749bf 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -530,7 +530,7 @@ def main(): assemblyDir = args.assemblyPath dataPath = args.dataPath core_path = args.coregroupPath - out = args.out + "/" + out = args.out pathFile = args.pathFile #I/O tmp = args.tmp @@ -591,7 +591,7 @@ def main(): print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group) - out = out + '/' + group + out = out + '/' + group + '/' if core_path == '': core_path = out + '/core_orthologs/' From 4cca757f6fec7ffbe309996c5b6a8bc98a48a866 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 13:18:56 +0200 Subject: [PATCH 015/192] bug fix if augutus can't idetify a gene at a candidate region --- fdog/fDOGassembly.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 87749bf..03af975 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -146,16 +146,21 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - sequence_file = open(tmp_path + name + ".aa", "r") - lines = sequence_file.readlines() - for line in lines: - if line[0] == ">": - id = line.replace(">", "") - header = ">" + group + "|" + ass_name + "|" + name + "_" + id - output.write(header) - else: - output.write(line) - sequence_file.close() + try: + sequence_file = open(tmp_path + name + ".aa", "r") + lines = sequence_file.readlines() + for line in lines: + if line[0] == ">": + id = line.replace(">", "") + header = ">" + group + "|" + ass_name + "|" + name + "_" + id + output.write(header) + else: + output.write(line) + sequence_file.close() + except FileNotFoundError: + print("No gene found by ID:" + name +" , continuing with next region") + + output.close() From d9bb72dcd0e1e359417d36edbc69de201aa29da6 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:07:44 +0200 Subject: [PATCH 016/192] testing --- fdog/fDOGassembly.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 03af975..8aa5f74 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -589,6 +589,8 @@ def main(): dataPath = cfg['dataPath'] except: dataPath = 'config' + if core_path == '': + core_path = out + '/core_orthologs/' if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' @@ -598,8 +600,7 @@ def main(): os.system('mkdir ' + out + '/' + group) out = out + '/' + group + '/' - if core_path == '': - core_path = out + '/core_orthologs/' + print(out) print("test " + group + "\n" ) @@ -659,9 +660,11 @@ def main(): else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") new_path = core_path + group +"/"+ group + "_new.aln" + print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path + print(cmd) result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) print("block profile is finished \n") From ddec3f0909fb9695c90569b674084b4826a7aa9c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:13:53 +0200 Subject: [PATCH 017/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8aa5f74..e309f33 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -659,7 +659,7 @@ def main(): print("block profile is finished \n") else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") - new_path = core_path + group +"/"+ group + "_new.aln" + new_path = core_path + "/" + group +"/"+ group + "_new.aln" print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) From 13aea2d3c2233ba9b32e857275d7e39d6574a2a0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:16:14 +0200 Subject: [PATCH 018/192] bug fix --- fdog/fDOGassembly.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e309f33..1691ac9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -591,6 +591,9 @@ def main(): dataPath = 'config' if core_path == '': core_path = out + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' @@ -660,11 +663,11 @@ def main(): else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") new_path = core_path + "/" + group +"/"+ group + "_new.aln" - print(cmd) + #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - print(cmd) + #print(cmd) result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) print("block profile is finished \n") From 89a8843fd1c80f2fddb690f5f0505dbf6f8293ba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:17:13 +0200 Subject: [PATCH 019/192] cleaning up --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1691ac9..6ba8aa6 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -662,7 +662,7 @@ def main(): print("block profile is finished \n") else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") - new_path = core_path + "/" + group +"/"+ group + "_new.aln" + new_path = core_path + group +"/"+ group + "_new.aln" #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) From 116acad39a7af8c56941b47a55fa96285ca7a132 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 09:50:21 +0200 Subject: [PATCH 020/192] testing --- fdog/fDOGassembly.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 6ba8aa6..27dc85b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -428,6 +428,7 @@ def checkOptions(): def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): candidates = readFasta(candidatesFile) ref = readFasta(fasta) + print(candidate_name) out = tmp_path + '/checkCoorthologs.fa' f = open(out,"w") @@ -441,8 +442,11 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.write(str(record.seq) + "\n") break + for record in candidates: + print(record.id + "ID\n") for name in candidate_names: + print(name + "name\n") if name in record.id: f.write(">" + name + "\n") f.write(str(record.seq) + "\n") @@ -604,9 +608,6 @@ def main(): out = out + '/' + group + '/' - - print(out) - print("test " + group + "\n" ) #print(assemblyDir) try: f = open(out + "/fdog.log", "a+") From 0078ee440f5e933bb81c6fb4eb12b88e788b05e0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 10:03:56 +0200 Subject: [PATCH 021/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 27dc85b..3d7a243 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -428,7 +428,7 @@ def checkOptions(): def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): candidates = readFasta(candidatesFile) ref = readFasta(fasta) - print(candidate_name) + print(candidate_names) out = tmp_path + '/checkCoorthologs.fa' f = open(out,"w") From c03e59dab4caf920874263fe4c6bc78ba4b36c25 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 10:12:08 +0200 Subject: [PATCH 022/192] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3d7a243..f4034b6 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -376,6 +376,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(orthologs) + orthologs = set(orthologs) return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): From 366a4ab858870057f7df27f4bfc2ad99134932eb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 12:18:15 +0200 Subject: [PATCH 023/192] testing --- fdog/fDOGassembly.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f4034b6..d751f53 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -22,7 +22,7 @@ def merge(blast_results, insert_length): locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) #print("test") - #print(locations) + print(locations) size_list = len(locations) j = 0 @@ -59,23 +59,19 @@ def merge(blast_results, insert_length): #print(blast_results) return blast_results, number_regions -def parse_blast(line, blast_results): - # format blast line: - #fomrat dictionary: {node_name: [(,)]} - #print(line) +def parse_blast(line, blast_results, cutoff): + # format blast line: + #fomrat dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") - #print(line_info) evalue = float(line_info[3]) - #cut off - if evalue > 0.00001: + if evalue > cutoff: return blast_results, evalue #add region to dictionary else: node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5] split = node_name.split("|") - # finding out on which strand tBLASTn founded a hit if sstart < send: strand = "+" @@ -83,7 +79,6 @@ def parse_blast(line, blast_results): sstart = line_info[2] send = line_info[1] strand = "-" - #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001 if len(split) > 1: node_name = split[1] @@ -96,7 +91,7 @@ def parse_blast(line, blast_results): return blast_results, evalue -def candidate_regions(intron_length, evalue, tmp_path): +def candidate_regions(intron_length, cutoff_evalue, tmp_path): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 blast_file = open(tmp_path + "/blast_results.out", "r") @@ -109,9 +104,9 @@ def candidate_regions(intron_length, evalue, tmp_path): if not line: break #parsing blast output - blast_results, evalue = parse_blast(line, blast_results) + blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off - if not evalue <= evalue: + if not evalue <= cutoff_evalue: break if blast_results == {}: return 0,0 @@ -429,7 +424,6 @@ def checkOptions(): def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): candidates = readFasta(candidatesFile) ref = readFasta(fasta) - print(candidate_names) out = tmp_path + '/checkCoorthologs.fa' f = open(out,"w") @@ -445,9 +439,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for record in candidates: - print(record.id + "ID\n") for name in candidate_names: - print(name + "name\n") if name in record.id: f.write(">" + name + "\n") f.write(str(record.seq) + "\n") @@ -603,7 +595,7 @@ def main(): if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' if out == '': - print('test out \n') + #print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group + '/' From 79f2b67802f76f5a3fbb003efbe9fd39f7db70df Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 13:43:34 +0200 Subject: [PATCH 024/192] bug fix in merge function, regions in minus strand were not merged correctly --- fdog/fDOGassembly.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d751f53..a3480a3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -36,27 +36,42 @@ def merge(blast_results, insert_length): c = locations[j][1] d = locations[j][5] e = locations[i][5] - if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])): + if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -= 1 - elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5])): + elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merge overlapping regions + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -= 1 + elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -=1 + elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #print(j) + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -=1 + i += 1 j += 1 number_regions += len(locations) blast_results[key] = locations - #print(blast_results) + print(blast_results) return blast_results, number_regions def parse_blast(line, blast_results, cutoff): From 5425cd138dc47933a0f019896e1fe5db337d5ad0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:09:51 +0200 Subject: [PATCH 025/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a3480a3..9694c6d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -45,7 +45,7 @@ def merge(blast_results, insert_length): i -= 1 elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions - locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 From 174cc0c834c6ea1c9fb89b553dfed24e89570778 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:10:11 +0200 Subject: [PATCH 026/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9694c6d..be67237 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -59,7 +59,7 @@ def merge(blast_results, insert_length): i -=1 elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #print(j) - locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 From ccc3e4eb0d0aae6eedae7b61e0ab1761ebcf31a2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:28:53 +0200 Subject: [PATCH 027/192] testing --- fdog/fDOGassembly.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index be67237..7ae65c0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -31,11 +31,6 @@ def merge(blast_results, insert_length): i = 1 while i < size_list-1: - a = locations[j][0] - b = locations[i][0] - c = locations[j][1] - d = locations[j][5] - e = locations[i][5] if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -79,6 +74,7 @@ def parse_blast(line, blast_results, cutoff): #fomrat dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") + print(line_info) evalue = float(line_info[3]) #cut off if evalue > cutoff: @@ -87,14 +83,14 @@ def parse_blast(line, blast_results, cutoff): else: node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5] split = node_name.split("|") - # finding out on which strand tBLASTn founded a hit + # finding out on which strand tBLASTn found a hit if sstart < send: strand = "+" else: sstart = line_info[2] send = line_info[1] strand = "-" - #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001 + #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off if len(split) > 1: node_name = split[1] if node_name in blast_results: From e2cb392d31015b99f49cca38b68f7cfacb28e7a6 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:35:26 +0200 Subject: [PATCH 028/192] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 7ae65c0..95fe32b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -81,14 +81,14 @@ def parse_blast(line, blast_results, cutoff): return blast_results, evalue #add region to dictionary else: - node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5] + node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) split = node_name.split("|") # finding out on which strand tBLASTn found a hit if sstart < send: strand = "+" else: - sstart = line_info[2] - send = line_info[1] + sstart = int(line_info[2]) + send = int(line_info[1]) strand = "-" #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off if len(split) > 1: From 6c9b25828e68d0a5dc79f7ed2dd28fcfb3d42aa4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:07:17 +0200 Subject: [PATCH 029/192] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 95fe32b..f8d6487 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -38,7 +38,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) @@ -52,7 +52,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 - elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #print(j) locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) @@ -74,7 +74,7 @@ def parse_blast(line, blast_results, cutoff): #fomrat dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") - print(line_info) + #print(line_info) evalue = float(line_info[3]) #cut off if evalue > cutoff: From b9c055ead8880df456dd1c5fc154bd79c0051f0b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:21:29 +0200 Subject: [PATCH 030/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f8d6487..996bec6 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,7 +27,7 @@ def merge(blast_results, insert_length): j = 0 - while j < size_list-2: + while j < size_list-1: i = 1 while i < size_list-1: From 79df315ba23f40bf8205221880a062d81f48b8ed Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:30:42 +0200 Subject: [PATCH 031/192] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 996bec6..f4da667 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -18,6 +18,7 @@ def load_config(config_file): def merge(blast_results, insert_length): number_regions = 0 + insert_length = int(insert_length) for key in blast_results: locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) From 0bc70a06235d836dd3c91ff98e2c16de16473364 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:36:56 +0200 Subject: [PATCH 032/192] testing --- fdog/fDOGassembly.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f4da667..02ff236 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,11 +27,9 @@ def merge(blast_results, insert_length): size_list = len(locations) j = 0 - while j < size_list-1: - i = 1 + i = j+1 while i < size_list-1: - if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -46,7 +44,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) From a31d5e9acf2fcac7d1d588af42edbc22e6219bdf Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:49:20 +0200 Subject: [PATCH 033/192] testing --- fdog/fDOGassembly.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 02ff236..d4e0518 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,9 +27,12 @@ def merge(blast_results, insert_length): size_list = len(locations) j = 0 - while j < size_list-1: - i = j+1 + while j < size_list-2: + i = j + 1 while i < size_list-1: + print("Vergleich \n") + print(locations[j] + "\n") + print(locations[i] + "\n") if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -37,6 +40,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 + print("M+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions locations[j][0] = min(locations[j][0], locations[i][0]) @@ -44,6 +48,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 + print("M-") elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) @@ -51,6 +56,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 + print("Insert+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #print(j) locations[j][0] = min(locations[j][0], locations[i][0]) @@ -58,6 +64,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 + print("Insert-") i += 1 j += 1 From 55137f49c3e4ba3986239084cbe002713257a888 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:52:54 +0200 Subject: [PATCH 034/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d4e0518..b3d78f9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -31,8 +31,8 @@ def merge(blast_results, insert_length): i = j + 1 while i < size_list-1: print("Vergleich \n") - print(locations[j] + "\n") - print(locations[i] + "\n") + print(str(locations[j]) + "\n") + print(str(locations[i]) + "\n") if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) From ab85180e94e60515963a1190386c0c68ed39e771 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 16:00:17 +0200 Subject: [PATCH 035/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b3d78f9..f1e3771 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,9 +27,9 @@ def merge(blast_results, insert_length): size_list = len(locations) j = 0 - while j < size_list-2: + while j < size_list-1: i = j + 1 - while i < size_list-1: + while i < size_list: print("Vergleich \n") print(str(locations[j]) + "\n") print(str(locations[i]) + "\n") From f66f72c5638323cc7d22b6f73bea38ce20f6cf2b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 13 Apr 2021 09:47:12 +0200 Subject: [PATCH 036/192] clean up --- fdog/fDOGassembly.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f1e3771..3b499a1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -23,16 +23,16 @@ def merge(blast_results, insert_length): locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) #print("test") - print(locations) + #print(locations) size_list = len(locations) j = 0 while j < size_list-1: i = j + 1 while i < size_list: - print("Vergleich \n") - print(str(locations[j]) + "\n") - print(str(locations[i]) + "\n") + #print("Vergleich \n") + #print(str(locations[j]) + "\n") + #print(str(locations[i]) + "\n") if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -40,7 +40,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - print("M+") + #print("M+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions locations[j][0] = min(locations[j][0], locations[i][0]) @@ -48,7 +48,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - print("M-") + #print("M-") elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) @@ -56,7 +56,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 - print("Insert+") + #print("Insert+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #print(j) locations[j][0] = min(locations[j][0], locations[i][0]) @@ -64,7 +64,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 - print("Insert-") + #print("Insert-") i += 1 j += 1 @@ -72,7 +72,7 @@ def merge(blast_results, insert_length): number_regions += len(locations) blast_results[key] = locations - print(blast_results) + #print(blast_results) return blast_results, number_regions def parse_blast(line, blast_results, cutoff): From f573dc4776fac4a9df2513191bcae389f365a9c1 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:33:44 +0200 Subject: [PATCH 037/192] testing --- fdog/fDOGassembly.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3b499a1..c64a244 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -77,7 +77,7 @@ def merge(blast_results, insert_length): def parse_blast(line, blast_results, cutoff): # format blast line: - #fomrat dictionary: {node_name: [(,,evalue, ,,)]} + # format dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") #print(line_info) @@ -123,7 +123,10 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): #parsing blast output blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off + print(evalue + " evalue candidate region \n") + print(cutoff + " cutoff evalue \n") if not evalue <= cutoff_evalue: + print("break \n") break if blast_results == {}: return 0,0 From 4dad8869a6ee3d5d013d1bbe4586f161455f19c6 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:41:24 +0200 Subject: [PATCH 038/192] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c64a244..126decf 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -131,6 +131,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): if blast_results == {}: return 0,0 else: + print(blast_results) candidate_regions, number_regions = merge(blast_results, intron_length) #candidate_regions, number_regions = merge_regions(blast_results, cut_off) #print(candidate_regions, number_regions) From ef9c17fda354bc4cd5b7954f6f93d4cadf360aba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:43:08 +0200 Subject: [PATCH 039/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 126decf..c69733a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -123,8 +123,8 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): #parsing blast output blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off - print(evalue + " evalue candidate region \n") - print(cutoff + " cutoff evalue \n") + print(str(evalue) + " evalue candidate region \n") + print(str(cutoff) + " cutoff evalue \n") if not evalue <= cutoff_evalue: print("break \n") break From e5b06e1d279195a08c6e94b79dafec192b0d82f4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:44:49 +0200 Subject: [PATCH 040/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c69733a..1b1c5f7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -124,7 +124,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off print(str(evalue) + " evalue candidate region \n") - print(str(cutoff) + " cutoff evalue \n") + print(str(cutoff_evalue) + " cutoff evalue \n") if not evalue <= cutoff_evalue: print("break \n") break From 7e0377db68470f2a2cdaefa308f1def70250fcbf Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 12:44:50 +0200 Subject: [PATCH 041/192] bug fix --- fdog/fDOGassembly.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1b1c5f7..0485db0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -122,12 +122,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): break #parsing blast output blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) - #evalue cut-off - print(str(evalue) + " evalue candidate region \n") - print(str(cutoff_evalue) + " cutoff evalue \n") - if not evalue <= cutoff_evalue: - print("break \n") - break + if blast_results == {}: return 0,0 else: @@ -731,7 +726,7 @@ def main(): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search against data base") - os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -out ' + tmp_path + '/blast_results.out') + os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + evalue + ' -out ' + tmp_path + '/blast_results.out') print("tBLASTn search is finished") ################### search for candidate regions and extract seq ########### From 721cfffea9d3837bb49b1a52c91dc6d362f18474 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 12:49:12 +0200 Subject: [PATCH 042/192] testing new tblastn call --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0485db0..8f47d98 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -726,7 +726,7 @@ def main(): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search against data base") - os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + evalue + ' -out ' + tmp_path + '/blast_results.out') + os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out') print("tBLASTn search is finished") ################### search for candidate regions and extract seq ########### From 496bb1f8c1dd1b0b158d36c99b567faeae7e67ca Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 12:55:54 +0200 Subject: [PATCH 043/192] testing --- fdog/fDOGassembly.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8f47d98..b9ee3f4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -126,9 +126,8 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): if blast_results == {}: return 0,0 else: - print(blast_results) candidate_regions, number_regions = merge(blast_results, intron_length) - #candidate_regions, number_regions = merge_regions(blast_results, cut_off) + print(candidate_regions) #print(candidate_regions, number_regions) return candidate_regions, number_regions From 2cdc82d53fc5dab4a82ddd9e03fbccc0d003d399 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 16 Apr 2021 10:19:54 +0200 Subject: [PATCH 044/192] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b9ee3f4..5d2f9e9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -127,7 +127,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): return 0,0 else: candidate_regions, number_regions = merge(blast_results, intron_length) - print(candidate_regions) + #print(candidate_regions) #print(candidate_regions, number_regions) return candidate_regions, number_regions @@ -750,7 +750,7 @@ def main(): ################# backward search to filter for orthologs################### reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path) - + print(reciprocal_sequences) if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': @@ -761,7 +761,7 @@ def main(): ################## checking accepted genes for co-orthologs ########################## reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - + print(reciprocal_sequences) ################ add sequences to extended.fa in the output folder########## From 3898d4ee8869332c76250593c2e2c391ad933c46 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 16 Apr 2021 10:27:00 +0200 Subject: [PATCH 045/192] testing --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5d2f9e9..842d67f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -437,6 +437,9 @@ def checkOptions(): #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): + if len(candidate_name) == 1: + return candidate_name + candidates = readFasta(candidatesFile) ref = readFasta(fasta) From e1fec1af78f1f59e43d4c4f1be83cbbfa67b661d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 16 Apr 2021 10:28:40 +0200 Subject: [PATCH 046/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 842d67f..d31af58 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -437,8 +437,8 @@ def checkOptions(): #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): - if len(candidate_name) == 1: - return candidate_name + if len(candidate_names) == 1: + return candidate_names candidates = readFasta(candidatesFile) ref = readFasta(fasta) From 65c1e1e0ae34b9bb948de5e2511cca1cc29f6781 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 18 Apr 2021 19:40:44 +0200 Subject: [PATCH 047/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d31af58..80582bc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -477,11 +477,11 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for name in candidate_names: distance = distances[ref_id , name] - if distance < min_dist: + if distance <= min_dist: min_dist = distance min_name = name - checked = [] + for name in candidate_names: From 34b2ee591f76296e48dfe27bdd46d6e3d6e666fd Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 20 Apr 2021 15:47:17 +0200 Subject: [PATCH 048/192] code clean up --- fdog/fDOGassembly.py | 120 ++++++++++++------------------------------- 1 file changed, 33 insertions(+), 87 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 80582bc..44e7607 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -17,6 +17,7 @@ def load_config(config_file): print(exc) def merge(blast_results, insert_length): + #merging overlapping and contigous candidate regions number_regions = 0 insert_length = int(insert_length) for key in blast_results: @@ -25,54 +26,44 @@ def merge(blast_results, insert_length): #print("test") #print(locations) size_list = len(locations) - j = 0 while j < size_list-1: i = j + 1 while i < size_list: - #print("Vergleich \n") - #print(str(locations[j]) + "\n") - #print(str(locations[i]) + "\n") if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): - #merge overlapping regions + #merge overlapping regions plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -= 1 - #print("M+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): - #merge overlapping regions + #merge overlapping regions minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -= 1 - #print("M-") elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): - #print(j) + #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -=1 - #print("Insert+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): - #print(j) + #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -=1 - #print("Insert-") - i += 1 j += 1 number_regions += len(locations) blast_results[key] = locations - #print(blast_results) return blast_results, number_regions def parse_blast(line, blast_results, cutoff): @@ -80,7 +71,6 @@ def parse_blast(line, blast_results, cutoff): # format dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") - #print(line_info) evalue = float(line_info[3]) #cut off if evalue > cutoff: @@ -127,12 +117,11 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): return 0,0 else: candidate_regions, number_regions = merge(blast_results, intron_length) - #print(candidate_regions) - #print(candidate_regions, number_regions) + return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path): - #print(region_dic) + for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" @@ -145,17 +134,18 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug locations = regions[key] counter = 0 for i in locations: + # some variables counter += 1 start = str(i[0] - length_extension) end = str(i[1] + length_extension) name = key + "_" + str(counter) - #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff") - + # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - + # parsing header and sequences try: sequence_file = open(tmp_path + name + ".aa", "r") lines = sequence_file.readlines() @@ -168,20 +158,15 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found by ID:" + name +" , continuing with next region") - - - + print("No gene found in region with ID:" + name + " , continuing with next region") output.close() def searching_for_db(assembly_path): - #print("test: " + str(assembly_path) + "\n") + db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] check = True for end in db_endings: - #print(assembly_path + end + "\n") check = check and os.path.exists(assembly_path + end) - #print(check) return check def get_distance_biopython(file, matrix): @@ -240,8 +225,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) - #d_ref = get_distance(aln_file, best_hit, ref) - #d = get_distance(aln_file, best_hit, candidate_name) distances = get_distance_biopython(aln_file, matrix) distance_hit_query = distances[best_hit, candidate_name] @@ -390,9 +373,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): - #print("addSequences") - #print(sequenceIds) - #print(species_list) + output_file = open(output, "a+") if refBool == False: seq_records_core = readFasta(core_fasta) @@ -406,8 +387,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: - #print(entry_candidate.id) - #print(sequenceIds) if entry_candidate.id in sequenceIds: output_file.write(">" + entry_candidate.id + "\n") output_file.write(str(entry_candidate.seq) + "\n") @@ -455,7 +434,6 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.write(str(record.seq) + "\n") break - for record in candidates: for name in candidate_names: if name in record.id: @@ -465,9 +443,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci if msaTool == "muscle": os.system("muscle -quiet -in " + out + " -out " + aln_file) - #print("muscle -quiet -in " + output_file + " -out " + aln_file) elif msaTool == "mafft-linsi": - #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file) distances = get_distance_biopython(aln_file, matrix) @@ -481,9 +457,6 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci min_dist = distance min_name = name - - - for name in candidate_names: if distances[min_name , name] < distances[min_name , ref_id]: checked.append(name) @@ -577,18 +550,6 @@ def main(): searchTaxon = args.searchTaxon silent = args.silent - ###################### How to handling std output ########################## - # if silent == True: - # print(out + "fdog.log \n") - # f = open(out + "fdog.log", "a+") - # sys.stdout = f - # else: - # print(out + "fdog.log \n") - # sys.stdout = Logger(out) - - - - #checking paths if dataPath == '': fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','') @@ -618,31 +579,25 @@ def main(): os.system('mkdir ' + out + '/' + group) out = out + '/' + group + '/' - - #print(assemblyDir) try: f = open(out + "/fdog.log", "a+") except FileNotFoundError: f = open(out + "/fdog.log", "w") + ################## How to handle std output and std error ################## if silent == True: sys.stderr = f sys.stdout = f else: sys.stdout = Logger(f) - - - # user input has to be checked here before fDOGassembly continues - assembly_names = os.listdir(assemblyDir) - - ########################## some variables ################################## refBool = False # checks if sequences of reference species were already part of the extended.fa file + ########### paths ########### msa_path = core_path + "/" + group +"/"+ group + ".aln" @@ -658,18 +613,16 @@ def main(): ######################## consensus sequence ################################ #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence \n") + print("Building a consensus sequence for gene " + group + " \n") os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path) print("consensus sequence is finished\n") ######################## block profile ##################################### - print("Building a block profile \n") + print("Building a block profile for gene " + group + " \n") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - #os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path) result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - #print(os.path.getsize(profile_path)) if int(os.path.getsize(profile_path)) > 0: print("block profile is finished \n") else: @@ -683,9 +636,9 @@ def main(): result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) print("block profile is finished \n") - searchBool = False + #################### fDOG assembly computation for all species ############# for asName in assembly_names: if searchBool == True: break @@ -694,6 +647,7 @@ def main(): searchBool = True ################### path definitions ################################### + os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1') tmp_path = out + "/tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" @@ -706,16 +660,13 @@ def main(): fasOutFile = out + "/" + group mappingFile = out + "/tmp/" + group + ".mapping.txt" - print("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - ######################## tBLASTn ########################################### - - #database anlegen + ######################## tBLASTn ########################################### + #checks if data base exists already db_check = searching_for_db(db_path) - #print(assembly_path) if db_check == 0: print("creating a blast data base \n") os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path) @@ -723,16 +674,13 @@ def main(): else: print('blast data base exists already, continuing...') - - #make a tBLASTn search against the new database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - + #makes a tBLASTn search against the new database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search against data base") os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out') print("tBLASTn search is finished") ################### search for candidate regions and extract seq ########### - # parse blast and filter for candiate regions regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) @@ -740,20 +688,20 @@ def main(): #no candidat region are available, no ortholog can be found print("No candidate region found") continue - else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") extract_seq(regions, db_path, tmp_path) ############### make Augustus PPX search ################################### + print("starting augustus ppx \n") augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path) print("augustus is finished \n") ################# backward search to filter for orthologs################### + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path) - print(reciprocal_sequences) if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': @@ -762,35 +710,34 @@ def main(): cleanup(tmp, tmp_path) return 1 - ################## checking accepted genes for co-orthologs ########################## - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - print(reciprocal_sequences) + ################## checking accepted genes for co-orthologs ################ + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) ################ add sequences to extended.fa in the output folder########## + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) refBool = True ############### make Annotation with FAS ################################### + # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName ) - - + #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) return 1 - + #if we searched in more than one taxon if fasoff == False and searchTaxon == '': tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group ) - ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) @@ -799,6 +746,5 @@ def main(): f.close() - if __name__ == '__main__': main() From 6546b530bc1209d940d50916667ef3ae213a6595 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 09:42:31 +0200 Subject: [PATCH 049/192] clean up code --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 44e7607..bc8eb54 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -576,7 +576,7 @@ def main(): if out == '': #print('test out \n') out = os.getcwd() - os.system('mkdir ' + out + '/' + group) + os.system('mkdir ' + out + '/' + group + '>/dev/null 2>&1') out = out + '/' + group + '/' try: From 583536554383b3222ce0a01eee343571d234cbec Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 15:01:48 +0200 Subject: [PATCH 050/192] clean up --- .DS_Store | Bin 6148 -> 6148 bytes fdog/fDOGassembly.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.DS_Store b/.DS_Store index fa2521e2436140a5f3689d5732ee4d25d777342f..ec261b8d3b9c0dfca3a952aa505e1a946aaf66ab 100644 GIT binary patch delta 140 zcmZoMXfc=|#>B!ku~2NHo+2a1#(>?7iv?Ji7&#{MFxfMnnC!#Uz#?82!uF&z$_^q@4UD1_lNJ1_q|| m%`cdrGHzz);O79^wV9FSJM(0I5l0S2pd3&M!{!K)HOv4JL?SK# delta 118 zcmZoMXfc=|#>B)qu~2NHo+2ar#(>?7jO>$nSnL^3P4;1FV3Vw_HZ(Aqe1Sz-aTWs@ zFfu}D27V|Fqh?PQVSTt+j6;BBGdl-A2hh~bf*jwOC-aLqa)8tT^|5S@5Lv?v0F#at Ad;kCd diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bc8eb54..1c2f21a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -576,7 +576,7 @@ def main(): if out == '': #print('test out \n') out = os.getcwd() - os.system('mkdir ' + out + '/' + group + '>/dev/null 2>&1') + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' try: @@ -608,7 +608,7 @@ def main(): ###################### create tmp folder ################################### - os.system('mkdir ' + out + '/tmp' + '>/dev/null 2>&1') + os.system('mkdir ' + out + '/tmp' + ' >/dev/null 2>&1') ######################## consensus sequence ################################ From 421580d7895fb76f32ed79820b9d652516af7bf3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 15:12:24 +0200 Subject: [PATCH 051/192] clean up --- fdog/fDOGassembly.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1c2f21a..2c57503 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -125,7 +125,7 @@ def extract_seq(region_dic, path, tmp_path): for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path): output = open(candidatesOutFile, "w") @@ -457,8 +457,10 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci min_dist = distance min_name = name + checked = [] + for name in candidate_names: - if distances[min_name , name] < distances[min_name , ref_id]: + if distances[min_name , name] <= distances[min_name , ref_id]: checked.append(name) return checked From 89dfaf0290ada42714b958057d16e537570b5beb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 15:20:06 +0200 Subject: [PATCH 052/192] reduce output --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2c57503..b2d2afa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -144,7 +144,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) # parsing header and sequences try: sequence_file = open(tmp_path + name + ".aa", "r") From ecf29edbc63829f9ee2cfedd872f2b5f4d857c67 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 22 Apr 2021 11:34:21 +0200 Subject: [PATCH 053/192] clean up code --- fdog/fDOGassembly.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b2d2afa..03f998a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -724,10 +724,12 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: + print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') - os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName ) + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") @@ -735,10 +737,12 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': + print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group ) + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) ################# remove tmp folder ######################################## if searchTaxon != '': From 55a9e6c2ce2fabc8d2872371e6bbe0dc7599407a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sat, 24 Apr 2021 11:05:32 +0200 Subject: [PATCH 054/192] check augustus --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 03f998a..b028245 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -141,7 +141,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug name = key + "_" + str(counter) # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, shell=True) # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) From d2492d036e66777104e1277f2035eebee6960f65 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sat, 24 Apr 2021 11:12:34 +0200 Subject: [PATCH 055/192] testing --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b028245..5e85998 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -125,7 +125,8 @@ def extract_seq(region_dic, path, tmp_path): for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" - result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + #result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, shell=True) def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path): output = open(candidatesOutFile, "w") From 2c0d152f76f9d1540e273822417f2ef9c224abaa Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:20:59 +0200 Subject: [PATCH 056/192] adding option to recognize if co-ortholog or not in header of the extended.fa --- fdog/fDOGassembly.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5e85998..bc3a290 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -390,8 +390,12 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: if entry_candidate.id in sequenceIds: - output_file.write(">" + entry_candidate.id + "\n") - output_file.write(str(entry_candidate.seq) + "\n") + if entry_candidate == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") output_file.close() return 0 @@ -459,10 +463,12 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci min_dist = distance min_name = name - checked = [] + checked = [min_name] for name in candidate_names: - if distances[min_name , name] <= distances[min_name , ref_id]: + if name == min_name: + pass + elif distances[min_name , name] <= distances[min_name , ref_id]: checked.append(name) return checked From 4b19832344ea880614875e6923f9f793b2202f87 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:25:54 +0200 Subject: [PATCH 057/192] testing --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bc3a290..6d5059f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -390,7 +390,8 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: if entry_candidate.id in sequenceIds: - if entry_candidate == sequenceIds[0]: + if entry_candidate.id == sequenceIds[0]: + print(entry_candidate.id) output_file.write(">" + entry_candidate.id + "|1" + "\n") output_file.write(str(entry_candidate.seq) + "\n") else: @@ -751,7 +752,7 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - + print(cmd) ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From db4c6a57fff0939bbae951a9c0ae3b1dc3c3384e Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:34:36 +0200 Subject: [PATCH 058/192] testing --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 6d5059f..2f780c5 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -383,7 +383,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for species in species_list: for entry_core in seq_records_core: if species in entry_core.id: - output_file.write(">" + entry_core.id + "\n") + output_file.write(">" + entry_core.id + "|1" + "\n") output_file.write(str(entry_core.seq) + "\n") seq_records_candidate = readFasta(candidate_fasta) @@ -403,6 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] + fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From f4871452939fa6b9952f1293b46c2aa3b2376464 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:54:12 +0200 Subject: [PATCH 059/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2f780c5..9ea9837 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -752,7 +752,7 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, shell=True) print(cmd) ################# remove tmp folder ######################################## if searchTaxon != '': From 43b73b0a63bea0b3b72557ec19fc1fe9b7ed2574 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:02:08 +0200 Subject: [PATCH 060/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9ea9837..c549076 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] - fas_seed_id = fas_seed_id.split("|")[0] + #fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From 620d5fa9cf37883ccd9e14556af6513e993559d5 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:34:04 +0200 Subject: [PATCH 061/192] testing --- fdog/fDOGassembly.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bc3a290..d13cbc8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -383,14 +383,15 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for species in species_list: for entry_core in seq_records_core: if species in entry_core.id: - output_file.write(">" + entry_core.id + "\n") + output_file.write(">" + entry_core.id + "|1" + "\n") output_file.write(str(entry_core.seq) + "\n") seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: if entry_candidate.id in sequenceIds: - if entry_candidate == sequenceIds[0]: + if entry_candidate.id == sequenceIds[0]: + print(entry_candidate.id) output_file.write(">" + entry_candidate.id + "|1" + "\n") output_file.write(str(entry_candidate.seq) + "\n") else: @@ -750,8 +751,8 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - + result = subprocess.run(cmd, shell=True) + print(cmd) ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From ac3477362a0e7339dbc6de19460e79cc26d8ce58 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:41:07 +0200 Subject: [PATCH 062/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c549076..d20968e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -383,7 +383,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for species in species_list: for entry_core in seq_records_core: if species in entry_core.id: - output_file.write(">" + entry_core.id + "|1" + "\n") + output_file.write(">" + entry_core.id + "\n") output_file.write(str(entry_core.seq) + "\n") seq_records_candidate = readFasta(candidate_fasta) From 86337fcb7b7884c0865bef1b56bd3f1daf26385a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:42:09 +0200 Subject: [PATCH 063/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d20968e..e8100ec 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] - #fas_seed_id = fas_seed_id.split("|")[0] + fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From 507238052124d6ea6e0c4f45594ff51d741a1614 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:47:37 +0200 Subject: [PATCH 064/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e8100ec..d20968e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] - fas_seed_id = fas_seed_id.split("|")[0] + #fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From df6d32467000ee0c350e313e68d118d2bbfcf90d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 10:54:45 +0200 Subject: [PATCH 065/192] added function starting_subprocess() to handle call of extern tools more easily --- fdog/fDOGassembly.py | 69 ++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d20968e..a7c24ed 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -16,6 +16,14 @@ def load_config(config_file): except yaml.YAMLError as exc: print(exc) +def starting_subprocess(cmd, mode): + if mode == 'debug': + result = subprocess.run(cmd, shell=True) + elif mode == 'silent': + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + elif mode == 'normal': + result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions number_regions = 0 @@ -120,15 +128,14 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): return candidate_regions, number_regions -def extract_seq(region_dic, path, tmp_path): +def extract_seq(region_dic, path, tmp_path, mode): for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" - #result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) - result = subprocess.run(cmd, shell=True) + starting_subprocess(cmd, mode) -def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path): +def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): output = open(candidatesOutFile, "w") for key in regions: @@ -143,10 +150,10 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - result = subprocess.run(cmd, shell=True) + starting_subprocess(cmd, mode) # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) # parsing header and sequences try: sequence_file = open(tmp_path + name + ".aa", "r") @@ -524,6 +531,8 @@ def main(): optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) + optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + args = parser.parse_args() @@ -561,6 +570,18 @@ def main(): fasoff = args.fasoff searchTaxon = args.searchTaxon silent = args.silent + debug = args.debug + + if debug == True and silent == True: + print("It's not possible to use booth modes, please restart and use --debug or --silent") + return 1 + else: + if debug == True: + mode = 'debug' + elif silent == True: + mode = 'silent' + else: + mode = 'normal' #checking paths if dataPath == '': @@ -598,11 +619,12 @@ def main(): ################## How to handle std output and std error ################## - if silent == True: + if mode == 'silent': sys.stderr = f sys.stdout = f else: sys.stdout = Logger(f) + # user input has to be checked here before fDOGassembly continues assembly_names = os.listdir(assemblyDir) @@ -620,20 +642,22 @@ def main(): ###################### create tmp folder ################################### - os.system('mkdir ' + out + '/tmp' + ' >/dev/null 2>&1') + cmd = 'mkdir ' + out + '/tmp' + starting_subprocess(cmd, 'silent') ######################## consensus sequence ################################ #make a majority-rule consensus sequence with the tool hmmemit from hmmer print("Building a consensus sequence for gene " + group + " \n") - os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path) + cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path + starting_subprocess(cmd, mode) print("consensus sequence is finished\n") ######################## block profile ##################################### print("Building a block profile for gene " + group + " \n") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) if int(os.path.getsize(profile_path)) > 0: print("block profile is finished \n") @@ -642,10 +666,10 @@ def main(): new_path = core_path + group +"/"+ group + "_new.aln" #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path #print(cmd) - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) print("block profile is finished \n") searchBool = False @@ -660,7 +684,8 @@ def main(): ################### path definitions ################################### - os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1') + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') tmp_path = out + "/tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" if searchTaxon != '': @@ -681,7 +706,8 @@ def main(): db_check = searching_for_db(db_path) if db_check == 0: print("creating a blast data base \n") - os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path) + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) print("database is finished \n") else: print('blast data base exists already, continuing...') @@ -689,7 +715,8 @@ def main(): #makes a tBLASTn search against the new database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search against data base") - os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out') + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + starting_subprocess(cmd, mode) print("tBLASTn search is finished") ################### search for candidate regions and extract seq ########### @@ -702,12 +729,12 @@ def main(): continue else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") - extract_seq(regions, db_path, tmp_path) + extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### print("starting augustus ppx \n") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path) + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) print("augustus is finished \n") ################# backward search to filter for orthologs################### @@ -737,9 +764,10 @@ def main(): print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') + cmd = 'mkdir ' + tmp_path + 'anno_dir' + starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") @@ -752,8 +780,7 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group - result = subprocess.run(cmd, shell=True) - print(cmd) + starting_subprocess(cmd, mode) ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 7187972986ee69a27b472104d981455498c208bb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 10:55:29 +0200 Subject: [PATCH 066/192] added augustus to dependencies --- .DS_Store | Bin 6148 -> 6148 bytes fdog/setup/setup_conda.sh | 3 +++ 2 files changed, 3 insertions(+) diff --git a/.DS_Store b/.DS_Store index ec261b8d3b9c0dfca3a952aa505e1a946aaf66ab..824f712743a6414728f27d69a840e656771a9cdf 100644 GIT binary patch delta 68 zcmZoMXffE}&BAzUvJXoGn`Cvhp@GTd3oOctvlzgDkr6^O@Iz@BHG8uN>qn-|>>Pjj E0r)Enz5oCK delta 40 wcmZoMXffE}&BAzMvJXoGi+FXl!Q@LU%8WB6e`AqnoVi(y^&`_}R*wJt02)~h&j0`b diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index fae81b7..ddc4e23 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -116,6 +116,7 @@ dependencies=( mafft # for linsi muscle fasta36 + augustus #for fdog.assembly ) for i in "${dependencies[@]}"; do @@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do fi elif [ "$tool" = "fasta36" ]; then conda install -y -c bioconda fasta3 + elif [ "$tool" = "augustus" ]; then + conda install -y -c bioconda augustus else conda install -y -c bioconda $i fi From 721bcdbaa9c0db7055c9bd3e4c0001cd613ea045 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 12:55:26 +0200 Subject: [PATCH 067/192] testing --- fdog/fDOGassembly.py | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a7c24ed..bdaf93b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -247,7 +247,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #rejected return 0, distance_ref_hit, distance_hit_query -def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path): +def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode): # the backward search uses the genes predicted from augustus and makes a blastp search #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa seedDic = getSeedInfo(fasta_path) @@ -263,7 +263,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva print("The fDOG reference species isn't part of the core ortholog group, ... exciting") return 0, seed if searchTool == "blast": - os.system("blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile) + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) else: print("diamonds are the girls best friends") ##### diamond call @@ -348,7 +349,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva print("The species " + species + " isn't part of the core ortholog group, ... exciting") return 0, seed - os.system("blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile) + cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) alg_file = open(tmp_path + "/blast_" + species, "r") lines = alg_file.readlines() alg_file.close() @@ -393,17 +395,18 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(">" + entry_core.id + "\n") output_file.write(str(entry_core.seq) + "\n") - seq_records_candidate = readFasta(candidate_fasta) - seq_records_candidate = list(seq_records_candidate) - for entry_candidate in seq_records_candidate: - if entry_candidate.id in sequenceIds: - if entry_candidate.id == sequenceIds[0]: - print(entry_candidate.id) - output_file.write(">" + entry_candidate.id + "|1" + "\n") - output_file.write(str(entry_candidate.seq) + "\n") - else: - output_file.write(">" + entry_candidate.id + "|0" + "\n") - output_file.write(str(entry_candidate.seq) + "\n") + if sequenceIds != 0: + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + print(entry_candidate.id) + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") output_file.close() return 0 @@ -738,16 +741,24 @@ def main(): print("augustus is finished \n") ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) > 0: + print("No genes found at candidate regions\n") + if searchTaxon == '': + continue + else: + addSequences(0, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) + return 0 - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path) + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': continue else: + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) cleanup(tmp, tmp_path) - return 1 + return 0 ################## checking accepted genes for co-orthologs ################ From 9a2e4d00a97cff812e623b4bf219e581ae08922b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:06:34 +0200 Subject: [PATCH 068/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bdaf93b..c22e515 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -741,7 +741,7 @@ def main(): print("augustus is finished \n") ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) > 0: + if int(os.path.getsize(candidatesOutFile)) <= 0: print("No genes found at candidate regions\n") if searchTaxon == '': continue From 1e5893b85c169899ed0ace275dcb3ff89ee5cdef Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:30:51 +0200 Subject: [PATCH 069/192] testing --- fdog/fDOGassembly.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c22e515..e52b8a4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -22,7 +22,7 @@ def starting_subprocess(cmd, mode): elif mode == 'silent': result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) elif mode == 'normal': - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions @@ -485,6 +485,17 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci return checked +def changes_for_fas(file, header, mode): + #def replace_first_line( src_filename, target_filename, replacement_line): + f_in = open(file) + first_line, remainder = f.readline(), f.read() + line = first_line.split("|")[0] + f_in.close() + f_out = open(file + "s","w") + f_out.write(line + "\n") + f_out.write(remainder) + f_out.close() + class Logger(object): def __init__(self, file): self.file = file @@ -746,23 +757,20 @@ def main(): if searchTaxon == '': continue else: - addSequences(0, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - return 0 + reciprocal_sequences = 0 + else: + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': continue else: - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - cleanup(tmp, tmp_path) - return 0 - - ################## checking accepted genes for co-orthologs ################ - - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + reciprocal_sequences = 0 + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) ################ add sequences to extended.fa in the output folder########## From e8440c86fcec447a0ff1d98ffd3d1940139a69bb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:42:49 +0200 Subject: [PATCH 070/192] testing --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e52b8a4..688a000 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -754,7 +754,7 @@ def main(): ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: print("No genes found at candidate regions\n") - if searchTaxon == '': + if searchTaxon == '' and refBool == True: continue else: reciprocal_sequences = 0 @@ -765,7 +765,7 @@ def main(): ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") - if searchTaxon == '': + if searchTaxon == '' and refBool == True: continue else: reciprocal_sequences = 0 @@ -788,6 +788,7 @@ def main(): cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, mode) #if we searched in more than one Taxon and no ortholog was found + if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) From 6362e47e45034fd026cfbc2e3319c3266a2c9d65 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:52:41 +0200 Subject: [PATCH 071/192] testing --- fdog/fDOGassembly.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 688a000..08cdfaa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -739,21 +739,23 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found - print("No candidate region found") - continue + if refBool == True: + print("No candidate region found") + continue else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### - print("starting augustus ppx \n") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("augustus is finished \n") + print("starting augustus ppx \n") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("augustus is finished \n") ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") + if int(os.path.getsize(candidatesOutFile)) <= 0 or regions == 0: + if regions != 0: + print("No genes found at candidate regions\n") if searchTaxon == '' and refBool == True: continue else: From 02ad76cd791f0c7d202f443ca2a0665a13271c3a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:58:03 +0200 Subject: [PATCH 072/192] testing --- fdog/fDOGassembly.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 08cdfaa..02f627f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -753,15 +753,14 @@ def main(): print("augustus is finished \n") ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0 or regions == 0: - if regions != 0: + if int(os.path.getsize(candidatesOutFile)) <= 0: print("No genes found at candidate regions\n") - if searchTaxon == '' and refBool == True: - continue + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 else: - reciprocal_sequences = 0 - else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) ################## checking accepted genes for co-orthologs ################ @@ -772,7 +771,10 @@ def main(): else: reciprocal_sequences = 0 else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + if regions != 0 + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + else: + reciprocal_sequences = 0 ################ add sequences to extended.fa in the output folder########## From ac929b7f87c55870f83cb2201d1bad8e4a2d56c2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:01:00 +0200 Subject: [PATCH 073/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 02f627f..c98f6a7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -771,7 +771,7 @@ def main(): else: reciprocal_sequences = 0 else: - if regions != 0 + if regions != 0: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) else: reciprocal_sequences = 0 From 060b4bb10297df20b627a6b71324c4926eef616a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:03:49 +0200 Subject: [PATCH 074/192] testing --- fdog/fDOGassembly.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c98f6a7..524b83f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -739,6 +739,7 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found + reciprocal_sequences = 0 if refBool == True: print("No candidate region found") continue @@ -771,10 +772,7 @@ def main(): else: reciprocal_sequences = 0 else: - if regions != 0: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - else: - reciprocal_sequences = 0 + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) ################ add sequences to extended.fa in the output folder########## From c996ca6287c601856bce1ab849bcd4bdaf9f86bf Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:10:07 +0200 Subject: [PATCH 075/192] testing --- fdog/fDOGassembly.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 524b83f..07dbe83 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -384,6 +384,10 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): + print(output) + print(refBool) + print(core_fasta) + print(species_list) output_file = open(output, "a+") if refBool == False: @@ -739,10 +743,11 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found - reciprocal_sequences = 0 + print("No candidate region found") if refBool == True: - print("No candidate region found") continue + else: + reciprocal_sequences = 0 else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") extract_seq(regions, db_path, tmp_path, mode) @@ -766,7 +771,8 @@ def main(): ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: - print("No ortholog fulfilled the reciprocity criteria") + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '' and refBool == True: continue else: From 3f46b83ad88816c741779f6a378e5f4ace1a6a11 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:15:41 +0200 Subject: [PATCH 076/192] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 07dbe83..09ac05e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -747,6 +747,7 @@ def main(): if refBool == True: continue else: + taxa = fdog_ref_species reciprocal_sequences = 0 else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") @@ -765,6 +766,7 @@ def main(): continue else: reciprocal_sequences = 0 + taxa = fdog_ref_species else: reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) From b5924a81f6784730b6863c298025aafee79614ae Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:21:39 +0200 Subject: [PATCH 077/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 09ac05e..ca89dd1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -747,7 +747,7 @@ def main(): if refBool == True: continue else: - taxa = fdog_ref_species + taxa = [fdog_ref_species] reciprocal_sequences = 0 else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") @@ -766,7 +766,7 @@ def main(): continue else: reciprocal_sequences = 0 - taxa = fdog_ref_species + taxa = [fdog_ref_species] else: reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) From 490f43cc42b3e8122441f12dcded7cb8f1a26a7b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:05:58 +0200 Subject: [PATCH 078/192] added function to clean up .domain files --- fdog/fDOGassembly.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ca89dd1..3c837dd 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -384,10 +384,6 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): - print(output) - print(refBool) - print(core_fasta) - print(species_list) output_file = open(output, "a+") if refBool == False: @@ -489,16 +485,17 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci return checked -def changes_for_fas(file, header, mode): - #def replace_first_line( src_filename, target_filename, replacement_line): - f_in = open(file) - first_line, remainder = f.readline(), f.read() - line = first_line.split("|")[0] - f_in.close() - f_out = open(file + "s","w") - f_out.write(line + "\n") - f_out.write(remainder) - f_out.close() +def clean_fas(path): + file = open(path, "r") + lines = file.readlines() + file.close() + file.open(path,"w") + + for line in lines: + long_id, remain = line.split("#") + id = long_id.split("|")[0] + new_line = id + "#" + remain + file.write(new_line) class Logger(object): def __init__(self, file): @@ -811,6 +808,8 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, mode) + clean_fas(group + "_forward.domains") + clean_fas(group + "_reverse.domains") ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 07c693d795385bfd0d1941271e8228aa6c71c240 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:15:11 +0200 Subject: [PATCH 079/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3c837dd..d50bfe8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -808,8 +808,8 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, mode) - clean_fas(group + "_forward.domains") - clean_fas(group + "_reverse.domains") + clean_fas(out + group + "_forward.domains") + clean_fas(out + group + "_reverse.domains") ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 3d804229698eb08161c2edd537dec774f6470a70 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:25:05 +0200 Subject: [PATCH 080/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d50bfe8..75e10f1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -489,7 +489,7 @@ def clean_fas(path): file = open(path, "r") lines = file.readlines() file.close() - file.open(path,"w") + file = open(path,"w") for line in lines: long_id, remain = line.split("#") From acdb6fe068a7d221d780d651660d6da6c45a830c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:47:07 +0200 Subject: [PATCH 081/192] testing --- fdog/fDOGassembly.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 75e10f1..23359d3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -401,7 +401,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for entry_candidate in seq_records_candidate: if entry_candidate.id in sequenceIds: if entry_candidate.id == sequenceIds[0]: - print(entry_candidate.id) output_file.write(">" + entry_candidate.id + "|1" + "\n") output_file.write(str(entry_candidate.seq) + "\n") else: @@ -485,16 +484,22 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci return checked -def clean_fas(path): +def clean_fas(path, file_type): file = open(path, "r") lines = file.readlines() file.close() file = open(path,"w") for line in lines: - long_id, remain = line.split("#") - id = long_id.split("|")[0] - new_line = id + "#" + remain + if file_type == 'domains': + long_id, remain = line.split("#") + id = long_id.split("|")[0] + new_line = id + "#" + remain + else: + long_id, remain = line.split("\t") + id = long_id.split("|")[0] + new_line = id + "\t" + remain + file.write(new_line) class Logger(object): @@ -808,8 +813,9 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, mode) - clean_fas(out + group + "_forward.domains") - clean_fas(out + group + "_reverse.domains") + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 38aca29591e1a54430a7e395bad343657a13aef8 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:58:34 +0200 Subject: [PATCH 082/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 23359d3..a021483 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -496,7 +496,7 @@ def clean_fas(path, file_type): id = long_id.split("|")[0] new_line = id + "#" + remain else: - long_id, remain = line.split("\t") + long_id, remain = line.split("\t", 1) id = long_id.split("|")[0] new_line = id + "\t" + remain From f46cdc0e65616bf95a13f8e69268092584399419 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 11 May 2021 15:59:56 +0200 Subject: [PATCH 083/192] improve user output --- fdog/fDOGassembly.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a021483..d5184b2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -149,8 +149,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug name = key + "_" + str(counter) # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" - #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - starting_subprocess(cmd, mode) + #print(cmd) + starting_subprocess(cmd, 'silent') # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" starting_subprocess(cmd, mode) @@ -396,6 +396,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(str(entry_core.seq) + "\n") if sequenceIds != 0: + #print(sequenceIds) seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: @@ -677,7 +678,7 @@ def main(): print("Building a block profile for gene " + group + " \n") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') if int(os.path.getsize(profile_path)) > 0: print("block profile is finished \n") @@ -689,7 +690,7 @@ def main(): starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path #print(cmd) - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') print("block profile is finished \n") searchBool = False @@ -798,7 +799,7 @@ def main(): cmd = 'mkdir ' + tmp_path + 'anno_dir' starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': @@ -812,7 +813,7 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') From b662346b1a96358729427f630685957e60058ad5 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 13:20:19 +0200 Subject: [PATCH 084/192] fdog.assembly started with fDOG is always silent --- .DS_Store | Bin 6148 -> 6148 bytes fdog/.DS_Store | Bin 8196 -> 8196 bytes fdog/bin/oneSeq.pl | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.DS_Store b/.DS_Store index 824f712743a6414728f27d69a840e656771a9cdf..bcbd073c8626ea73a8116c4f66a9c94aeb88f9c8 100644 GIT binary patch delta 34 pcmZoMXffDuo<-Ei&`3wY(8#P-N1@u#$lOpz!PLTh^Ai>YVF0V92}l3{ delta 34 ncmZoMXffDuo<-Eqz(hyE(Acn6N1@u#2*fcrG1>ftML`$)ct%kpLn9ppLnE_V9ffK`BXdI?1yc+2%?*M}8727{ z(imJA{2AOC5*dntFqI*fA&DV}p>ncXx^l*k4q0K|bT3IG5A delta 49 zcmV-10M7q}K!iY$PXQCLP`eKS6SE8uUjdWD5=yhR5pV&suM=1Vk${S`2N?DQvxgOc H0+E1+rmhjk diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7139af7..7e8a248 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -701,7 +701,7 @@ if ($assembly){ $eval_blast = sprintf("%f", $eval_blast); if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath); + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); if (defined $assemblyPath){ push(@assembly_cmd, "--assemblyPath $assemblyPath") From a751205c0bdc4832cb26a8955b3a04e05f332046 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 13:41:04 +0200 Subject: [PATCH 085/192] testing --- fdog/bin/oneSeq.pl | 2 +- fdog/fDOGassembly.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7e8a248..7139af7 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -701,7 +701,7 @@ if ($assembly){ $eval_blast = sprintf("%f", $eval_blast); if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath); if (defined $assemblyPath){ push(@assembly_cmd, "--assemblyPath $assemblyPath") diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d5184b2..8884fba 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -396,7 +396,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(str(entry_core.seq) + "\n") if sequenceIds != 0: - #print(sequenceIds) + print(sequenceIds) seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: From eb9f585088bad8b476c02add5fbd8a78bead8c84 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 13:54:32 +0200 Subject: [PATCH 086/192] testing output --- fdog/fDOGassembly.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8884fba..4e9e6be 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -396,7 +396,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(str(entry_core.seq) + "\n") if sequenceIds != 0: - print(sequenceIds) seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: @@ -800,6 +799,11 @@ def main(): starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, 'silent') + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') + + #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': From bb3c148b46b874865e67314a88b07b443c9dcfeb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:00:32 +0200 Subject: [PATCH 087/192] testing --- fdog/fDOGassembly.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 4e9e6be..1b84a1e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -631,6 +631,10 @@ def main(): out = os.getcwd() os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' + else: + if out[-1] != "/": + out = out + "/" + try: f = open(out + "/fdog.log", "a+") From be2b9d4b3b1ea5a5e8ba214ff0c5d5754a4f82e8 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:01:43 +0200 Subject: [PATCH 088/192] testing --- fdog/mergeAssemblyOutput.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index ea6e084..11d5c36 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,6 +107,7 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) + os.system("rm *.tsv") if phyloprofile: phyloprofile.close() From 6fbd5aadcc9ee3151ddfd1fb75a8e882b83bf1b2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:06:19 +0200 Subject: [PATCH 089/192] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1b84a1e..de9f343 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -803,9 +803,9 @@ def main(): starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, 'silent') - clean_fas(out + group + "_forward.domains", 'domains') - clean_fas(out + group + "_reverse.domains", 'domains') - clean_fas(out + group + ".phyloprofile", 'phyloprofile') + clean_fas(fasOutFile + "_forward.domains", 'domains') + clean_fas(fasOutFile + "_reverse.domains", 'domains') + clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') #if we searched in more than one Taxon and no ortholog was found From 34d683c8aaa9529344b070a0fdccaebce77a10f3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:07:48 +0200 Subject: [PATCH 090/192] testing --- fdog/mergeAssemblyOutput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index 11d5c36..79a1306 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,7 +107,7 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) - os.system("rm *.tsv") + os.system("rm " + directory + "/*.tsv") if phyloprofile: phyloprofile.close() From f9504745c247595c867669695d8b302fd30571a7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:14:29 +0200 Subject: [PATCH 091/192] testing --- fdog/bin/oneSeq.pl | 2 +- fdog/mergeAssemblyOutput.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7139af7..7e8a248 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -701,7 +701,7 @@ if ($assembly){ $eval_blast = sprintf("%f", $eval_blast); if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath); + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); if (defined $assemblyPath){ push(@assembly_cmd, "--assemblyPath $assemblyPath") diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index 79a1306..6c865a1 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,7 +107,7 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) - os.system("rm " + directory + "/*.tsv") + os.system("rm " + directory + "/'*.tsv'") if phyloprofile: phyloprofile.close() From 0b129a293cd1fcf30770883e1796bd830f8e4dee Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:28:05 +0200 Subject: [PATCH 092/192] removing automatically .tsv files if existing --- fdog/mergeAssemblyOutput.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index 6c865a1..1606b1d 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,7 +107,8 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) - os.system("rm " + directory + "/'*.tsv'") + elif infile.endswith('.tsv'): + os.remove(directory + '/' + infile) if phyloprofile: phyloprofile.close() From 6c6b1258f1376b0cff530e1492c7a40200946915 Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 31 May 2021 15:35:20 +0200 Subject: [PATCH 093/192] Fdog goes assembly (#8) * testing * shorten long header for addTaxon, check for long headers in oneseq and checkData * testing * testing * testing * changed path in hamstr.pl to current directory * changed path in hamstr.pl to current directory * testing * testing * testing * testing * testing * testing * bug fix * bug fix * fixed error mapping ID file not found * testing * testing * testing * test * test * testing * testing * testing * testing * fDOGassembly is working on complete assembly_dir * bug fix * bug fix * enabled option -filter for blastp search * bug fix fasoff * testing --strict option * bug fix in --strict option, output is corrected * bug fix in --checkCoorthologsRef * bug fix * clean up * bug fix * adapted handling of variable dataPath * testing * testing * testing * testing * test * test * test * test * test * test * testing * bug fix assemblyDir * testing * testing * testing search taxa * test * enable --searchTaxa option in fdog.assembly * bug fix * testing * testing --searchTaxa adaption * testing * test * test * write debug files to output dir * skip fa.mapping while checking genome_dir * testing * bug fix * testing * bug fix * bug fix * path fix in augustus_ppx * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * testing * testing * added new python script to merge Assembly output from the same Gene but different searchTaxa * added option to merge Assembly output after fDOG calls fdog.assembly multiple times with different searchTaxa * bug fix * corrected fdog.mergeAssembly call * testing * testing * testing * test * moved fdog.mergeAssembly call to another place * testing * testing * testing * testing * testing * testing * corrected fdog.mergeAssembly call * testing * testing * testing * testing * test * disable weight_dir check if option --assembly is used * adapted fdog.assembly call * adapted calcFAS call to deactivate .tsv output * testing * testing * bug fix in function backward search used with option --strict * testing new added option --silent * added more checks to fdogs.run * bug fix * testing * testing * testing * bug fix * bug fix * testing * testing silent mode * testing --silent * symlinks for fasta36 input; improved fdogs.run according to #5 * testing * testing * testing * testing * tetsing * testing * testing * testing * testing * testing * testing * testing * test * test * testing * testing new function to identify coorthologs * testing * testing * testing * testing * testing * testing * testing * testing * testing * finished function coorthologs * bug fix runSingle.py * cleaning output * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * bug fix if augutus can't idetify a gene at a candidate region * testing * bug fix * bug fix * cleaning up * testing * testing * testing * testing * bug fix in merge function, regions in minus strand were not merged correctly * testing * testing * testing * testing * testing * bug fix * testing * testing * testing * testing * testing * clean up * testing * testing * testing * testing * bug fix * testing new tblastn call * testing * testing * testing * testing * testing * code clean up * clean up code * clean up * clean up * reduce output * clean up code * check augustus * testing * adding option to recognize if co-ortholog or not in header of the extended.fa * testing * testing * testing * testing * testing * testing * testing * testing * added function starting_subprocess() to handle call of extern tools more easily * added augustus to dependencies * testing * bug fix * testing * testing * testing * testing * testing * testing * testing * testing * testing * added function to clean up .domain files * testing * testing * testing * testing * improve user output * fdog.assembly started with fDOG is always silent * testing * testing output * testing * testing * testing * testing * testing * removing automatically .tsv files if existing Co-authored-by: trvinh --- .DS_Store | Bin 6148 -> 6148 bytes .github/workflows/github_build.yml | 51 ++ .gitignore | 3 +- .travis.yml | 23 +- README.md | 3 +- fdog/.DS_Store | Bin 8196 -> 8196 bytes fdog/addTaxa.py | 3 +- fdog/addTaxon.py | 16 +- fdog/bin/hamstr.pl | 103 +-- fdog/bin/oneSeq.pl | 340 ++++++---- fdog/checkData.py | 11 +- fdog/data/.DS_Store | Bin 8196 -> 6148 bytes fdog/fDOGassembly.py | 837 ++++++++++++++++++++++++ fdog/fdog_goes_assembly/.DS_Store | Bin 6148 -> 0 bytes fdog/fdog_goes_assembly/fDOGassembly.py | 209 ------ fdog/mergeAssemblyOutput.py | 124 ++++ fdog/runMulti.py | 207 ++++-- fdog/runSingle.py | 149 ++++- fdog/setup/setup.sh | 3 +- fdog/setup/setup_conda.sh | 6 +- fdog/setupfDog.py | 6 +- setup.py | 10 +- 22 files changed, 1619 insertions(+), 485 deletions(-) create mode 100644 .github/workflows/github_build.yml create mode 100644 fdog/fDOGassembly.py delete mode 100644 fdog/fdog_goes_assembly/.DS_Store delete mode 100644 fdog/fdog_goes_assembly/fDOGassembly.py create mode 100644 fdog/mergeAssemblyOutput.py diff --git a/.DS_Store b/.DS_Store index c84405d9d29ae54bb91cc188eb50403196c8adc3..bcbd073c8626ea73a8116c4f66a9c94aeb88f9c8 100644 GIT binary patch delta 171 zcmZoMXfc=|#>B!ku~2NHo+2a1#(>?7i&&T#IVSTk*)yJ+?8DTcC0Si1vqhM-bUR%q_A+Bm@>zR;SSyf$ATQ_U+S0;JJ*^@t{E_)P^JIPz RM-E1y98d|v<_M8B%m6dnE297a delta 121 zcmZoMXfc=|#>B)qu~2NHo+2ar#(>?7jO>$nSnL^3PWEAG(3GgIHZ(BNQ7|+%tkqGd zwlp%(Q7|*KtgYqb5LY#{^-RdEtg5c5t(!Ud8;d;StjS`m4>yZ(h_YvOFr1p~CD@=PSzT>pXr!ZH zXk=Eaqfl*VWNxUVU}|ArTg%BIu4-uOnUGsqRb5kCH*2zwu)O$ephjkfB!*0eyt3e; zyqx^Jbf5_1k( y8Ck>T1;T=i+^k?@OBhNS3MShL-VjIPBRgjEI*|aT&Fm82SSFtlT8u}PFB1SkZfHaR delta 203 zcmZp1XmOa}&nUDpU^hRb&}1Hglao&imQ21Za&|JWu!OP@LmER0Lq0<~Lp%`YGo&&U z16hecvKYwDWk_PknS4=H#E4I})m1V(2c{%xc=|GbiH?I*E cWZb+<)R$>v11IBVc8PB+o7uz|fjl-w0N=DXc>n+a diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index 1c83bb5..d392c8c 100644 --- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -95,7 +95,7 @@ def runAddTaxon(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.1' + version = '0.0.5' parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -125,6 +125,7 @@ def main(): sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') with open(pathconfigFile) as f: outPath = f.readline().strip() + outPath = os.path.abspath(outPath) noAnno = args.noAnno coreTaxa = args.coreTaxa oldFAS = args.oldFAS diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index e09f1e4..fe0a810 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -77,13 +77,13 @@ def runBlast(args): subprocess.call([blastCmd], shell = True) except: sys.exit('Problem with running %s' % blastCmd) - fileInGenome = "%s/genome_dir/%s/%s.fa" % (outPath, specName, specName) + fileInGenome = "../../genome_dir/%s/%s.fa" % (specName, specName) fileInBlast = "%s/blast_dir/%s/%s.fa" % (outPath, specName, specName) if not Path(fileInBlast).exists(): os.symlink(fileInGenome, fileInBlast) def main(): - version = '0.0.2' + version = '0.0.5' parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -115,6 +115,7 @@ def main(): sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') with open(pathconfigFile) as f: outPath = f.readline().strip() + outPath = os.path.abspath(outPath) noAnno = args.noAnno coreTaxa = args.coreTaxa ver = str(args.verProt) @@ -152,10 +153,13 @@ def main(): seq = str(inSeq[id].seq) # check ID id = re.sub('\|', '_', id) - if len(id) > 80: - # modIdIndex = modIdIndex + 1 - # id = specName + "_" + str(modIdIndex) + oriId = id + if len(id) > 30: + modIdIndex = modIdIndex + 1 + id = specName + "_" + str(modIdIndex) longId = 'yes' + with open(specFile + '.mapping', 'a') as mappingFile: + mappingFile.write('%s\t%s\n' % (id, oriId)) if not id in tmpDict: tmpDict[id] = 1 else: @@ -184,7 +188,7 @@ def main(): cf.close() # warning about long header if longId == 'yes': - print('\033[91mWARNING: Headers are longer than 80 characters. It could cause some troubles!\033[0m') + print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile) else: print(genomePath + '/' + specName + '.fa already exists!') diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl index 37ae73a..7ff125e 100755 --- a/fdog/bin/hamstr.pl +++ b/fdog/bin/hamstr.pl @@ -193,9 +193,11 @@ ## 13.07.2020 (v13.3.0 - vinh) solved problem when gene ID contains PIPE ## 22.07.2020 (v13.4.0 - vinh) moved tmp blast files to output folder and delete them when finished ## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef +## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name +## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory ######################## start main ########################################### -my $version = "HaMStR v.13.4.1"; +my $version = "HaMStR v.13.4.4"; ######################## checking whether the configure script has been run ### my $configure = 0; if ($configure == 0){ @@ -214,8 +216,9 @@ my $filter = 'F'; # low complexity filter switch. Default 'on'. Set of 'F' to turn off permanently. my $eval_blast = 10; # default evalue cutoff for the blast search ########## EDIT THE FOLLOWING LINES TO MODIFY DEFAULT PATHS ################### -my $path = abs_path(dirname(__FILE__)); -$path =~ s/\/bin//; +# my $path = abs_path(dirname(__FILE__)); +# $path =~ s/\/bin//; +my $path = getcwd; my $hmmpath = "$path/core_orthologs"; #path where the hmms are located my $blastpath = "$path/blast_dir"; #path to the blast-dbs my $outpath = '.'; @@ -223,10 +226,10 @@ my $hmm_dir = 'hmm_dir'; my $fa_dir = 'fa_dir'; ############################## -my $termios = new POSIX::Termios; $termios->getattr; -my $ospeed = $termios->getospeed; -my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; +# my $termios = new POSIX::Termios; $termios->getattr; +# my $ospeed = $termios->getospeed; +# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; +# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; ############################## Variables ############## my $fileobj; @@ -322,16 +325,16 @@ } ## help message my $helpmessage = " -${bold}YOU ARE RUNNING $version on $hostname$norm +YOU ARE RUNNING $version on $hostname This program is freely distributed under a GPL. Copyright (c) GRL limited: portions of the code are from separate copyrights -\n${bold}USAGE:${norm} hamstr -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [OPTIONS] +\nUSAGE: hamstr -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [OPTIONS] -${bold}OPTIONS:$norm +OPTIONS: -${bold}REQUIRED$norm +REQUIRED -sequence_file=<> path and name of the file containing the sequences hmmer is run against. -hmmset=<> @@ -359,7 +362,7 @@ set this flag if you are searching in protein sequences. Note, if neither the -est nor the -protein flag is set, HaMStR will guess the sequence type. -${bold}USING NON-DEFAULT PATHS$norm +USING NON-DEFAULT PATHS -blastpath=<> Lets you specify the absolute or relative path to the blast databases. DEFAULT: $blastpath @@ -368,7 +371,7 @@ -outpath=<> You can determine the path to the HaMStR output. Default: current directory. -${bold}ADDITIONAL OPTIONS$norm +ADDITIONAL OPTIONS -append set this flag if the output should be appended to the files *.out and *_cds.out. This becomes relevant when running @@ -412,7 +415,7 @@ -hmm Option to provide only a single hmm to be used for the search. Note, this file has to end with .hmm --intron=<${bold}k${norm}eep|${bold}m${norm}ask|${bold}r${norm}emove> +-intron= Specify how to deal with introns that may occur in transcript sequences. Default: keep - Introns will be retained in the transcript but will be identified by lower case letters. -longhead @@ -512,7 +515,7 @@ ## 1) check if all information is available to run HaMStR ($check, @log) = &checkInput(); if ($check == 0) { - print "\n\n${bold}There was an error running $version$norm\n\n"; + print "\n\nThere was an error running $version\n\n"; print join "\n", @log; exit; } @@ -783,11 +786,11 @@ sub checkInput { my @coresets = (`ls $hmmpath`); chomp @coresets; if (scalar(@coresets > 0)){ - print "\n${bold}THE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:${norm}\n\n"; + print "\nTHE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:\n\n"; for (my $i = 0; $i < @coresets; $i++){ my @available = qw(); my @unavailable = qw(); - print "\n${bold}$coresets[$i]${norm}\n\n"; + print "\n$coresets[$i]\n\n"; my @refspec = `head -n 20 $hmmpath/$coresets[$i]/$coresets[$i].fa |$grepprog '>' |cut -d '|' -f 2 |sort |uniq`; chomp @refspec; for (my $j = 0; $j < @refspec; $j++){ @@ -807,7 +810,7 @@ sub checkInput { } } else { - print "\n${bold}NO CORE ORTHOLOG SETS ARE AVAILABLE! CHECK $hmmpath!${norm}\n\n"; + print "\nNO CORE ORTHOLOG SETS ARE AVAILABLE! CHECK $hmmpath!\n\n"; } print "\n\n"; exit; @@ -873,12 +876,17 @@ sub checkInput { } $dbfile =~ s/.*\///; - $dbfile_short = $dbfile; - $dbfile_short =~ s/\..*//; + # $dbfile_short = $dbfile; + # $dbfile_short =~ s/\..*//; + my @dbfileTMP = split(/\./, $dbfile); pop @dbfileTMP; + $dbfile_short = join(".", @dbfileTMP); if ($central) { $dboutpath = $dbpath; # print "setting dboutpath to $dboutpath"; } + + # print "HERERERERERERERERER $dbfile #################\n"; + # print "THENNNNNNNNNNNNNNNN $dbfile_short #################\n"; ## ## 0) Check for presence of the file with the sequences that should be hamstered if (-e "$dbpath/$dbfile") { @@ -886,7 +894,7 @@ sub checkInput { } else { #the provided infile does not exist: - push @log, "${bold}FATAL:${norm} The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n"; + push @log, "FATAL: The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n"; $check = 0; return ($check, @log); } @@ -952,7 +960,7 @@ sub checkInput { push @log, "Translated file already exists, using this one"; } if (! -e "$dboutpath/$dbfile") { - push @log, "${bold}FATAL:${norm} The translation of $dbfile_base failed. Check the script translate.pl"; + push @log, "FATAL: The translation of $dbfile_base failed. Check the script translate.pl"; print "failed\n"; $check = 0; } @@ -965,7 +973,7 @@ sub checkInput { push @log, "\nCHECKING FOR PROGRAMS\n"; printOUT("checking for the blast program:\t"); if (`which $blast_prog` =~ / no /) { - push @log, "${bold}FATAL:${norm} could not execute $blast_prog. Please check if this program is installed and executable"; + push @log, "FATAL: could not execute $blast_prog. Please check if this program is installed and executable"; print "failed\n"; $check = 0; } @@ -979,12 +987,12 @@ sub checkInput { printOUT("checking for hmmsearch:\t"); my $hmmcheck = `$prog -h |$grepprog -c 'HMMER 3'`; if (! `$prog -h`) { - push @log, "${bold}FATAL:${norm} could not execute $prog. Please check if this program is installed and executable"; + push @log, "FATAL: could not execute $prog. Please check if this program is installed and executable"; print "failed: $prog is not installed or not executable\n"; $check = 0; } elsif ($hmmcheck != 1) { - push @log, "${bold}FATAL:${norm} It seems that $prog is not from the HMMER 3 package. Please check!"; + push @log, "FATAL: It seems that $prog is not from the HMMER 3 package. Please check!"; print "failed: $prog is not from the HMMER 3 package\n"; $check = 0; } @@ -996,14 +1004,14 @@ sub checkInput { if ($check_genewise) { printOUT("checking for genewise:\t"); if (! `genewise -help`) { - push @log, "${bold}FATAL:${norm} Could not execute genewise. Please check if this program is installed and executable"; + push @log, "FATAL: Could not execute genewise. Please check if this program is installed and executable"; print "failed: genewise is not executable\n"; $check = 0; } else { my $gwcheck = `echo \$WISECONFIGDIR`; if (length($gwcheck) < 1) { - push @log, "${bold}FATAL:${norm} The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise. + push @log, "FATAL: The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise. Please consult the installation manual for genewise and set this variable"; print "failed: the environmental variable WISECONFIGDIR has not been set.\n"; $check = 0; @@ -1014,14 +1022,14 @@ sub checkInput { } } else { - push @log, "${bold}GENEWISE-CHECK skipped:${norm} The hamstr-script has been configured with the option --protein_only. To override this setting set reconfigure the script or set the variable $check_genewise to 1"; + push @log, "GENEWISE-CHECK skipped: The hamstr-script has been configured with the option --protein_only. To override this setting set reconfigure the script or set the variable $check_genewise to 1"; } ## 4) Check for presence of the directory structure push @log, "\nCHECKING FOR HMMs\n"; printOUT("checking for presence of the hmm files:\t"); if ( ! defined $hmmset or ! -e "$hmmpath/$hmmset") { - push @log, "${bold}FATAL:${norm} You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets."; + push @log, "FATAL: You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets."; print "failed\n"; $check = 0; } @@ -1033,7 +1041,7 @@ sub checkInput { ## 4b) check for the presence of the hmm-files and the fasta-file if (!(-e "$hmm_dir")) { - push @log, "${bold}FATAL:${norm} Could not find $hmm_dir"; + push @log, "FATAL: Could not find $hmm_dir"; print "failed\n"; $check = 0; } else { @@ -1043,7 +1051,7 @@ sub checkInput { ### check for the presence of all hmms for (my $k = 0; $k < @hmms; $k++) { if (! -e "$hmm_dir/$hmms[$k]") { - push @log, "${bold}FATAL:${norm} $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]"; + push @log, "FATAL: $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]"; $check = 0; last; } else { @@ -1073,7 +1081,7 @@ sub checkInput { } } else { - push @log, "${bold}FATAL:${norm} Please provide path and name of fasta file containing the core-ortholog sequences"; + push @log, "FATAL: Please provide path and name of fasta file containing the core-ortholog sequences"; $check = 0; print "failed\n"; } @@ -1086,7 +1094,7 @@ sub checkInput { $taxon_check = 2; } else { - push @log, "${bold}FATAL:${norm} No taxon_file found. Please provide a global taxon name using the option -taxon"; + push @log, "FATAL: No taxon_file found. Please provide a global taxon name using the option -taxon"; print "failed\n"; $check = 0; } @@ -1094,7 +1102,7 @@ sub checkInput { push @log, "\nCHECKING FOR REFERENCE TAXON\n"; printOUT("checking for reference species and blast-dbs:\t"); if (!(defined $refspec_string) and (! defined $strict and ! defined $relaxed)) { - push @log, "${bold}FATAL:${norm} Please provide a reference species for the reblast!"; + push @log, "FATAL: Please provide a reference species for the reblast!"; print "failed\n"; $check = 0; } @@ -1146,7 +1154,7 @@ sub checkInput { printOUT("succeeded\n"); } else { - push @log, "${bold}FATAL:${norm} please edit the blastpath. Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist."; + push @log, "FATAL: please edit the blastpath. Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist."; print "$blastpathtmp failed\n"; $check = 0; } @@ -1174,7 +1182,7 @@ sub checkInput { push @log, "\tinfile ready"; } else { #the provided reference fasta file does not exist or link to file does not exist: - push @log, "${bold}FATAL:${norm} FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n"; + push @log, "FATAL: FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n"; $check = 0; return ($check, @log); } @@ -1241,7 +1249,7 @@ sub checkInput { printOUT("checking for low complexity filter setting:\t"); $filter =~ tr/ft/FT/; if ($filter ne 'T' and $filter ne 'F') { - push @log, "${bold}FATAL:${norm} Filter is set to $filter. Please set the low complexity filter either to F or T."; + push @log, "FATAL: Filter is set to $filter. Please set the low complexity filter either to F or T."; print "low complexity filter check failed\n"; $check = 0; } @@ -1283,12 +1291,10 @@ sub checkInput { `rm -rf "$fa_dir_neu"`; `mkdir "$fa_dir_neu"`; } - if (!(-d "$tmpdir")) { - `mkdir "$tmpdir"`; - } - elsif (-d "$tmpdir" and $cleartmp) { + mkdir "$tmpdir" unless -d "$tmpdir"; + if (-d "$tmpdir" and $cleartmp) { `rm -rf "$tmpdir"`; - `mkdir "$tmpdir"`; + mkdir "$tmpdir" unless -d "$tmpdir"; } } ## 14) determin whether or not the -representative flag has been set @@ -1401,23 +1407,23 @@ sub check4reciprocity { my $suc = 0; # keeps track of success for a single taxon if ($checkCoRef == 0) { ## the user does not want to check further in case that id of best blast hit and of reference species differ - printOUT("core_orthologs: ", join "\t", @original_ids , "\n"); + printOUT("core_orthologs: @original_ids\n"); ## now loop through the best hits with the same score and check whether ## among these I find the same seq as in $original my $i = 0; while ($suc == 0 and $i <@$hits) { - printOUT("blast-hit: $hits->[$i]->{name}"); + printOUT("blast-hit: $hits->[$i]->{name}\n"); ## now loop through all the refspec-sequences in the hmm file; this is the case when co-orthologs have been determine in the core-ortholog my $j = 0; while ($suc == 0 and $j < @original_ids) { if ($original_ids[$j] eq $hits->[$i]->{name}) { - printOUT("\thitting\n"); + printOUT("hitting $original_ids[$j]\n"); $refspec_final->[$k]->{hit} = $j; $suc = 1; $relaxed_suc = 1; } else { - printOUT("\nnot hitting $original_ids[$j]\n"); + printOUT("not hitting $original_ids[$j]\n"); $j ++; } if ($suc == 1) { @@ -1468,7 +1474,7 @@ sub check4reciprocity { } ## print distances (debug mode) if ($debug){ - my $distDebugFile = $path . "/output/" . $taxon_global . ".debug.dist"; + my $distDebugFile = $outpath . "/" . $taxon_global . ".debug.dist"; #$path . "/output/" . $taxon_global . ".debug.dist"; unless (-e $distDebugFile){ open (my $DISTDEBUG, ">>$distDebugFile") or die "Error, could not create file: ". "$distDebugFile"; print $DISTDEBUG "hmmset\trefid\tbestid\tqueryid\tqhdist\trhdist\n"; @@ -2024,9 +2030,8 @@ sub determineRefspecFinal { my $ac = 0; for (my $i = 0; $i < @refspec; $i++) { $fafile =~ s/\|/\\\|/g; - @original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile |$sedprog -e "s/.*$refspec[$i]\|//"`; + @original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile | grep -v "^\-\-\$" |$sedprog -e "s/.*$refspec[$i]\|//"`; chomp @original; - if (@original > 0) { $refspec_final->[$ac]->{refspec} = $refspec[$i]; $refspec_final->[$ac]->{searchdb} = "$blastpath/$refspec[$i]/$refspec[$i]" . $blastapp; diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 61cae86..7e8a248 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -121,9 +121,15 @@ ## Modified 22. Sep 2020 v2.2.1 (Vinh) - make sure that seed sequence always at the beginning of extended.fa output ## Modified 23. Sep 2020 v2.2.3 (Vinh) - use full taxonomy name instead of abbr taxon name for LOG ## Modified 01. Dec 2020 v2.2.4 (Vinh) - fixed bug while creating final extended.fa (and replaced grep and sed by bioperl) +## Modified 16. Feb 2021 v2.2.5 (Vinh) - core compilation works with fasoff +## Modified 18. Feb 2021 v2.2.6 (Vinh) - fixed searchTaxa and coreTaxa options +## Modified 19. March 2021 v2.2.7 (Vinh) - check for long sequence ID +## Modified 24. March 2021 v2.2.8 (Vinh) - skip fa.mapping while checking genome_dir +## Modified 29. March 2021 v2.2.9 (Vinh) - check for zero $maxAlnScore +## - solved problem with long input path for fasta36 tools ############ General settings -my $version = 'oneSeq v.2.2.4'; +my $version = 'oneSeq v.2.2.9'; ##### configure for checking if the setup.sh script already run my $configure = 0; if ($configure == 0){ @@ -133,10 +139,10 @@ my $hostname = `hostname`; chomp $hostname; ############# -my $termios = new POSIX::Termios; $termios->getattr; -my $ospeed = $termios->getospeed; -my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; +# my $termios = new POSIX::Termios; $termios->getattr; +# my $ospeed = $termios->getospeed; +# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; +# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; #### Paths my $path = abs_path(dirname(__FILE__)); $path =~ s/\/bin//; @@ -166,7 +172,7 @@ my $blast_prog = 'blastp'; my $outputfmt = 'blastxml'; my $eval_blast_query = 0.0001; -my $filter = 'T'; +my $filter = 'F'; # default for blastp my $annotation_prog = "annoFAS"; my $fas_prog = "calcFAS"; my $fdogFAS_prog = "fdogFAS"; @@ -197,6 +203,7 @@ my $idx_dir = "$path/taxonomy/"; my $dataDir = $path . '/data'; my $weightPath = "$path/weight_dir/"; +my $assembly_dir = "$path/assembly_dir/"; my @defaultRanks = ( 'superkingdom', 'kingdom', @@ -300,6 +307,15 @@ my %hashTree; my $aln = 'muscle'; my $searchTaxa; +#variables for fdog_goes_assembly +my $assembly; +my $augustusRefSpec; +my $avIntron; +my $lengthExtension; +my $assemblyPath; +my $searchTool = 'blast'; +my $matrix = 'blosum62'; +my $dataPath = ''; ################# Command line options GetOptions ( "h" => \$help, @@ -361,7 +377,15 @@ "distDeviation=s" => \$distDeviation, "aligner=s" => \$aln, "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa + "searchTaxa=s" => \$searchTaxa, + "assembly" => \$assembly, + "assemblypath=s" => \$assemblyPath, + "augustusRefSpec=s" => \$augustusRefSpec, + "avIntron=s" => \$avIntron, + "lengthExtension=s" => \$lengthExtension, + "searchTool=s" => \$searchTool, + "scoringmatrix=s" => \$matrix, + "dataPath=s" => \$dataPath ); $outputPath = abs_path($outputPath); @@ -373,6 +397,8 @@ $weightPath = abs_path($weightPath)."/"; $genome_dir = abs_path($genome_dir)."/"; $taxaPath = $genome_dir; +$dataPath = abs_path($dataPath)."/"; +$assembly_dir = abs_path($assemblyPath)."/"; ############# do initial check if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { @@ -381,7 +407,7 @@ initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n"; - if (!defined $coreex) { + if (!defined $coreex && !defined $assembly) { if (!grep(/$minDist/, @defaultRanks)) { die "ERROR: minDist $minDist invalid!\n"; } @@ -464,7 +490,7 @@ # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) # get annotations for seed sequence if fas support is on -if ($fas_support){ +if ($fas_support && !$assembly){ if (!$weightPath) { createWeightFolder(); } @@ -473,7 +499,7 @@ my $coreStTime = gettime(); #time; #core-ortholog search -if (!$coreex) { +if (!$coreex && !$assembly) { print "\nCore compiling...\n"; $coremode = 1; $taxaPath = $blastPath; @@ -562,11 +588,14 @@ } } printDebug("The maximum alignmentscore is: $maxAlnScore"); + if ($maxAlnScore == 0) { + die("Maximum alignment score is Zero! Something went wrong with fasta36 functions!\n") + } clearTmpFiles(); my $addedTaxon = getBestOrtholog(); my $addedTaxonName = getTaxonName($addedTaxon); - print "Added TAXON: $addedTaxon\_$addedTaxonName\n"; + print "Added TAXON: $addedTaxon\t$addedTaxonName\n"; #if a new core ortholog was found if($addedTaxon ne "") { $hamstrSpecies = $hamstrSpecies . "," . $addedTaxon; @@ -608,12 +637,17 @@ my $final_eval_blast = $eval_blast*$eval_relaxfac; my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - $taxaPath = $genome_dir; + if (!$assembly){ + $taxaPath = $genome_dir; + } + else{ + $taxaPath = $assembly_dir; + } my @searchTaxa; - unless($groupNode) { - @searchTaxa = keys %taxa; - } else { - unless ($searchTaxa) { + unless ($searchTaxa) { + unless($groupNode) { + @searchTaxa = keys %taxa; + } else { # %taxa = getTaxa(); # print "GET TAXA TIME: ", roundtime(gettime() - $startTmp),"\n"; my $tree = getTree(); @@ -629,11 +663,11 @@ foreach (get_leaves($tree)) { push(@searchTaxa, @{$_->name('supplied')}[0]); } - } else { - open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n"; - @searchTaxa = ; - close (SEARCH); } + } else { + open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n"; + @searchTaxa = ; + close (SEARCH); } # print "PREPARE TIME: ", roundtime(gettime() - $startTmp),"\n"; @@ -645,15 +679,82 @@ foreach (sort @searchTaxa) { chomp(my $searchTaxon = $_); my $pid = $pm->start and next; + if ($coreex) { + $db = Bio::DB::Taxonomy->new(-source => 'flatfile', + -nodesfile => $idx_dir . 'nodes.dmp', + -namesfile => $idx_dir . 'names.dmp', + -directory => $idx_dir); + $db_bkp = $db; + } my $searchTaxonName = getTaxonName($searchTaxon); if (defined($searchTaxonName)) { unless ($silent) { print $searchTaxon, "\t", $searchTaxonName, "\n"; } else { - print $searchTaxonName, "\n"; + unless ($searchTaxonName eq "Unk") { + print $searchTaxonName, "\n"; + } else { + print $searchTaxon, "\n"; + } + } + } + if ($assembly){ + $eval_blast = sprintf("%f", $eval_blast); + if ($seqFile ne "") { + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); + + if (defined $assemblyPath){ + push(@assembly_cmd, "--assemblyPath $assemblyPath") + } + if (defined $avIntron){ + push(@assembly_cmd, "--avIntron $avIntron "); + } + if (defined $lengthExtension){ + push(@assembly_cmd, "--lengthExtension $lengthExtension "); + } + if (!$autoclean){ + push(@assembly_cmd, "--tmp "); + } + if ($outputPath){ + push(@assembly_cmd, "--out $outputPath "); + } + if (defined $strict){ + push(@assembly_cmd, "--strict"); + } + if ($eval_blast){ + push(@assembly_cmd, "--evalBlast $eval_blast "); + } + if ($searchTool){ + push(@assembly_cmd, "--msaTool $aln "); + } + if (defined $checkcoorthologsref){ + push(@assembly_cmd, "--checkCoorthologsRef"); + } + if ($searchTool){ + push(@assembly_cmd, "--searchTool $searchTool"); + } + if ($matrix){ + push(@assembly_cmd, "--scoringmatrix $matrix"); + } + if ($coreOrthologsPath){ + push(@assembly_cmd, "--coregroupPath $coreOrthologsPath"); + } + if ($fasoff){ + push(@assembly_cmd, "--fasoff"); + } + if ($searchTaxon){ + push(@assembly_cmd, "--searchTaxon $searchTaxon"); + } + if ($filter){ + push(@assembly_cmd, "--filter $filter"); + } + printDebug(@assembly_cmd); + system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n"; } } + else{ runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln); + } $pm->finish; } $pm->wait_all_children; @@ -661,8 +762,8 @@ push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n"; -## Evaluation of all orthologs that are predicted by the final run -if(!$coreOnly){ + +if(!$coreOnly && !$assembly){ my $fasStTime = gettime(); my $processID = $$; @@ -671,10 +772,10 @@ die "ERROR: Could not find $finalOutput\n"; } # check and add seed to final extended.fa if needed - addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # BLABLABLABLA + addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # calculate FAS scores for final extended.fa - if ($fas_support) { + if ($fas_support && !$assembly) { print "Starting the feature architecture similarity score computation...\n"; my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu"; unless ($countercheck) { @@ -687,12 +788,21 @@ } push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; + if($autoclean){ print "Cleaning up...\n"; runAutoCleanUp($processID); } } +if ($assembly){ + my $file_assembly_out; + $file_assembly_out = $outputPath . '/' . $seqName; + my $cmd_merge; + $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup"; + printDebug($cmd_merge); + system($cmd_merge); +} ## Delete tmp folder unless ($debug) { my $delTmp = "rm -rf $tmpdir"; @@ -721,8 +831,12 @@ sub clearTmpFiles { } #clear all alignment files - my @files = glob("*.scorefile"); - foreach my $file (@files) { + my @scorefiles = glob("*.scorefile"); + foreach my $file (@scorefiles) { + unlink($file); + } + my @fastaInfiles = glob("*_fasta36.fa"); + foreach my $file (@fastaInfiles) { unlink($file); } } @@ -761,21 +875,19 @@ sub getCandicontent{ sub getCumulativeAlnScores{ chdir($coreOrthologsPath . $seqName); my $candidatesFile = $outputFa . ".extended"; - my $scorefile = $$ . ".scorefile"; + my $fileId = $$; + my $scorefile = $fileId . ".scorefile"; + my $fasta36file1 = $fileId . ".1_fasta36.fa"; + my $fasta36file2 = $fileId . ".2_fasta36.fa"; my %scores; + ######################## ## step: 1 - ## setup - ## set alignment command (glocal, local, or global) - #local local:local ssearch36 Smith-Waterman - #glocal global:local glsearch36 Needleman-Wunsch - #global global:global ggsearch36 Needleman-Wunsch - my $loclocCommand = "$localaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globlocCommand = "$glocalaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globglobCommand = "$globalaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; + ## set alignment parameters for fasta36 + my $fasta36cmd = $fasta36file1 . "\" \"" . $fasta36file2 . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; + ######################## ## step: 2 - ## setup ## candidates to hash ## %candicontent keeps info about all candidates (header and sequence) my %candicontent = getCandicontent(); @@ -784,11 +896,25 @@ sub getCumulativeAlnScores{ ## step: 3 ## get alignment scores chdir($coreOrthologsPath . $seqName); + symlink($outputFa, $fasta36file1); + symlink($candidatesFile, $fasta36file2); if ($glocal){ + #glocal global:local glsearch36 Needleman-Wunsch + my $globlocCommand = "$glocalaligner \"" . $fasta36cmd; + printDebug($globlocCommand); + # print $globlocCommand,"\n";<>; system($globlocCommand); }elsif ($global){ + #global global:global ggsearch36 Needleman-Wunsch + my $globglobCommand = "$globalaligner \"" . $fasta36cmd; + printDebug($globglobCommand); + # print $globglobCommand,"\n";<>; system($globglobCommand); }elsif ($local){ + #local local:local ssearch36 Smith-Waterman + my $loclocCommand = "$localaligner \"" . $fasta36cmd; + printDebug($loclocCommand); + # print $loclocCommand,"\n";<>; system($loclocCommand); } ######################## @@ -806,49 +932,7 @@ sub getCumulativeAlnScores{ ## Get the alinment scores for the current candidate file sub getAlnScores{ chdir($coreOrthologsPath . $seqName); - my $candidatesFile = $outputFa . ".extended"; - my $scorefile = $$ . ".scorefile"; - my %scores; - - ######################## - ## step: 1 - ## setup - ## set alignment command (glocal, local, or global) - #local local:local ssearch36 Smith-Waterman - #glocal global:local glsearch36 Needleman-Wunsch - #global global:global ggsearch36 Needleman-Wunsch - my $loclocCommand = "$localaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globlocCommand = "$glocalaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globglobCommand = "$globalaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - - ######################## - ## step: 2 - ## setup - ## candidates to hash - ## %candicontent keeps info about all candidates (header and sequence) - my %candicontent = getCandicontent(); - - ######################## - ## step: 3 - ## get alignment scores - chdir($coreOrthologsPath . $seqName); - if ($glocal){ - system($globlocCommand); - }elsif ($global){ - system($globglobCommand); - }elsif ($local){ - system($loclocCommand); - } - - ######################## - ## step: 4 - ## collect alignment score - ## keep track about min and max for each query/coreortholog vs candidate set - my $max = -10000000; - my $min = 10000000; - - %scores = cumulativeAlnScore($scorefile, \%candicontent); - + my %scores = getCumulativeAlnScores(); ## Normalize Alignment scores (unity-based) printDebug("Normalize alignment scores:\n"); foreach my $key (keys %scores){ @@ -885,8 +969,8 @@ sub getFasScore{ ## step: 2 ## get FAS score ## fas support: on/off + my @candidateIds = keys(%candicontent); if ($fas_support){ - my @candidateIds = keys(%candicontent); my ($name,$gene_set,$gene_id,$rep_id) = split(/\|/, $candidateIds[0]); unless (-e "$weightPath/$gene_set.json") { print "ERROR: $weightPath/$gene_set.json not found! FAS Score will be set as zero.\n"; @@ -898,6 +982,8 @@ sub getFasScore{ my @fasOutTmp = split(/\t/,$fasOutTmp); $fas_box{$candidateIds[0]} = $fasOutTmp[1]; } + } else { + $fas_box{$candidateIds[0]} = 1; } return %fas_box; } @@ -1123,10 +1209,10 @@ sub checkOptions { if ($force == 1 and $append ==1) { $force = 0; } - ### check the presence of the pre-computed core set - if ($coreex) { + ### check the presence of the pre-computed core set if options reuseCore or assembly is used + if ($coreex || $assembly) { if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; + print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; exit; } } @@ -1155,7 +1241,7 @@ sub checkOptions { ### end move up ### adding new routine to generate the input sequence if -reuseCore has been set if ($coreex) { - my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa`; + my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa | grep -v "^\-\-\$"`; chomp @refseq; unless ($silent) { print "$refseq[0]\n"; @@ -1197,7 +1283,7 @@ sub checkOptions { ### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected $optbreaker = 0; - while(!$minCoreOrthologs and !$coreex) { + while(!$minCoreOrthologs and (!$coreex and !$assembly)) { if ($optbreaker >= 3){ print "No proper number given ... exiting.\n"; exit; @@ -1212,10 +1298,12 @@ sub checkOptions { $filter = 'no' if $filter eq 'F'; } - $inputSeq = fetchSequence($seqFile, $dataDir); + if (!$assembly){ + $inputSeq = fetchSequence($seqFile, $dataDir); + } ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '') { + if($seqId eq '' && !$assembly) { my $besthit; if (!$blast){ ## a refspec has been determined @@ -1230,6 +1318,9 @@ sub checkOptions { $refSpec = $besthit->{species}; my $details = "Evalue: " . $besthit->{evalue}; printOut("Seq id has been determined as $seqId in $refSpec with $details", 2); + if(length("$seqName|$refSpec|$seqId") > 60) { + die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; + } if($seqId eq '') { print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n"; exit; @@ -1241,13 +1332,13 @@ sub checkOptions { print "Please specify a valid file with taxa for the core orthologs search\n"; exit; } - my @userTaxa = parseTaxaFile(); + my @userTaxa = parseTaxaFile($coreTaxa); my %newTaxa = (); foreach (@userTaxa) { $newTaxa{$_} = $taxa{$_}; } $newTaxa{$refSpec} = $refTaxa{$refSpec}; - %taxa = %newTaxa; + %refTaxa = %newTaxa; } if($group) { @@ -1334,14 +1425,14 @@ sub checkOptions { } } - my $node; - $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); - $node->name('supplied', $refSpec); - #### checking for the min and max distance for the core set compilation #### omit this check, if the option reuseCore has been selected (added 2019-02-04) $optbreaker = 0; - if (!$coreex) { + if (!$coreex and !$assembly) { + my $node; + #print "Testing coreex assembly\n"; + $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); + $node->name('supplied', $refSpec); if (lc($maxDist) eq "root"){ $maxDist = 'no rank'; } @@ -1357,9 +1448,6 @@ sub checkOptions { $maxDist = parseInput($node, $in); print "You selected ". $maxDist . " as maximum rank\n\n"; } - } - $optbreaker = 0; - if (!$coreex){ while (!$minDist or (checkRank($minDist, $node) == 0)) { if ($optbreaker >= 3){ print "No proper minDist given ... exiting.\n"; @@ -1373,6 +1461,7 @@ sub checkOptions { print "You selected " . $minDist . " as minimum rank\n\n"; } } + $optbreaker = 0; #### checking in fas options if($fasoff){ @@ -1596,8 +1685,9 @@ sub getBestOrtholog { ## candidates alnScore is high enought, that it would be better with a fasScore of one ## -> evaluate if ($alnScores{$candiKey} > $rankScore * (1 + $distDeviation) - 1){ + %fas_box = getFasScore(); if (!$gotFasScore and $fas_support){ - %fas_box = getFasScore(); + # %fas_box = getFasScore(); $gotFasScore = 1; } ## get rankscore @@ -1622,8 +1712,9 @@ sub getBestOrtholog { } ## candidate has the same distance, as the last one and could be better, with a fasScore of one elsif (defined $hashTree{$newNoRankDistNode}{$key->id} and $alnScores{$candiKey} > $rankScore - 1){ + %fas_box = getFasScore(); if (!$gotFasScore and $fas_support){ - %fas_box = getFasScore(); + # %fas_box = getFasScore(); $gotFasScore = 1; } ## get rankscore @@ -1909,7 +2000,7 @@ sub getTaxonName { if (defined($taxon)) { return($taxon->scientific_name); } else { - return("Unk NCBI taxon for $taxAbbr"); + return("Unk"); } } @@ -2008,6 +2099,7 @@ sub runHamstr { print EXTENDEDFA ">$tmpId[0]\|$tmpId[-3]\|$tmpId[-2]\|$tmpId[-1]\n",$resultSeq->seq,"\n"; } } + # addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa); } else { # add seed sequence to output extended.fa if no ortholog was found in refSpec if ($taxon eq $refSpec) { @@ -2054,11 +2146,13 @@ sub addSeedSeq { # get seed sequence and add it to the beginning of the fasta output open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n"; my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta'); + my %idTmp; # used to check which seq has already been written to output while(my $seq = $seqio->next_seq) { my $id = $seq->id; if ($id =~ /$refSpec/) { + $idTmp{"$id|1"} = 1; print TEMP ">$id|1\n", $seq->seq, "\n"; - last; + #last; } } # then write other sequences @@ -2066,7 +2160,9 @@ sub addSeedSeq { while(my $seq = $seqio2->next_seq) { my $id = $seq->id; unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) { - print TEMP ">$id\n", $seq->seq, "\n"; + unless ($idTmp{$id}) { + print TEMP ">$id\n", $seq->seq, "\n"; + } } } close(TEMP); @@ -2096,17 +2192,19 @@ sub parseInput { } ########################## sub parseTaxaFile { - open (INPUT, "<$coreTaxa") or die print "Error opening file with taxa for core orthologs search\n"; + my $coreTaxaFile = $_[0]; + open (INPUT, "<$coreTaxaFile") or die print "Error opening file with taxa for core orthologs search\n"; my @userTaxa; while() { my $line = $_; chomp($line); - if(!$taxa{$line}) { - print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n"; - exit; - } - else { - push(@userTaxa, $line); + if (length($line) > 0) { + if(!$taxa{$line}) { + print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n"; + exit; + } else { + push(@userTaxa, $line); + } } } close INPUT; @@ -2592,7 +2690,7 @@ sub initialCheck { } } # check weight_dir - if ($fasoff != 1) { + if ($fasoff != 1 && !$assembly) { my %seen; my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); chomp(my $allAnno = `ls $weightDir | $sedprog \'s/\\.json//\'`); @@ -2607,7 +2705,7 @@ sub initialCheck { sub getGenomeFile { my ($folder, $filename) = @_; - chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.tmp\"`); + chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.mapping\\|\\.tmp\"`); my $out = $faFile; chomp(my $link = `$readlinkprog -f $faFile`); if ($link ne "") { @@ -2641,23 +2739,23 @@ sub checkValidFolderName { ########################### sub helpMessage { my $helpmessage = " -${bold}YOU ARE RUNNING $version on $hostname$norm +YOU ARE RUNNING $version on $hostname This program is freely distributed under a GPL. Copyright (c) GRL limited: portions of the code are from separate copyrights -\n${bold}USAGE:${norm} oneSeq.pl -seqFile=<> -seqId=<> -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS] +\nUSAGE: oneSeq.pl -seqFile=<> -seqId=<> -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS] -${bold}OPTIONS:$norm +OPTIONS: -${bold}GENERAL$norm +GENERAL -h Invoke this help method -version Print the program version -${bold}REQUIRED$norm +REQUIRED -seqFile=<> Specifies the file containing the seed sequence (protein only) in fasta format. @@ -2677,7 +2775,7 @@ sub helpMessage { -coreOrth=<> Specify the number of orthologs added to the core set. -${bold}USING NON-DEFAULT PATHS$norm +USING NON-DEFAULT PATHS -outpath=<> Specifies the path for the output directory. Default is $outputPath; @@ -2690,7 +2788,7 @@ sub helpMessage { -weightpath=<> Specifies the path for the pre-calculated feature annotion directory. Default is $weightPath; -${bold}ADDITIONAL OPTIONS$norm +ADDITIONAL OPTIONS -append Set this flag to append the output to existing output files @@ -2777,7 +2875,7 @@ sub helpMessage { Set the alignment strategy during core ortholog compilation to glocal. -searchTaxa Input file containing list of search taxa. -${bold}SPECIFYING FAS SUPPORT OPTIONS$norm +SPECIFYING FAS SUPPORT OPTIONS -fasoff Turn OFF FAS support. Default is ON. @@ -2790,7 +2888,7 @@ sub helpMessage { -countercheck Set this flag to counter-check your final profile. The FAS score will be computed in two ways (seed vs. hit and hit vs. seed). -${bold}SPECIFYING EXTENT OF OUTPUT TO SCREEN$norm +SPECIFYING EXTENT OF OUTPUT TO SCREEN -debug Set this flag to obtain more detailed information about the programs actions diff --git a/fdog/checkData.py b/fdog/checkData.py index 59256bc..84310ac 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -70,6 +70,12 @@ def checkValidFasta(file): fasta = SeqIO.parse(f, 'fasta') if not any(fasta): return('notFasta') + else: + # check for long header + inSeq = SeqIO.to_dict((SeqIO.parse(open(file), 'fasta'))) + for id in inSeq: + if len(id) > 30: + return('longHeader') # check space or tab if any(s in f.read() for s in spaceChr): return('space') @@ -90,6 +96,7 @@ def checkValidSeqs(faFile): faSeq = SeqIO.parse(open(faFile),'fasta') for fa in faSeq: id, seq = fa.description, str(fa.seq) + c = '' if any(e in id for e in spaceChr): sys.exit('*** ERROR: Invalid character found in \">%s\" in %s' % (id, faFile)) if any(c for c in seq if not c.isalpha()): @@ -131,6 +138,8 @@ def checkDataFolder(checkDir, replace, delete, concat): checkFaFile = checkValidFasta(faFile) if checkFaFile == 'notFasta': sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile) + elif checkFaFile == 'longHeader': + sys.exit('*** ERROR: %s contains long headers!' % faFile) elif checkFaFile == 'space': sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile) elif checkFaFile == 'multiLine': @@ -184,7 +193,7 @@ def checkMissingNcbiID(namesDmp, taxaList): return(missingTaxa.keys(), dupTaxa) def main(): - version = '0.0.2' + version = '0.0.3' parser = argparse.ArgumentParser(description='You are running fdog.checkData version ' + str(version) + '.') parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='') parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. fdog_dataPath/blast_dir)', action='store', default='') diff --git a/fdog/data/.DS_Store b/fdog/data/.DS_Store index fde072a6aebc6f6618808f5bbd3cd63c202098d1..bf1ded6ef3f07fae44d0ee29b918a7b6e62c579b 100644 GIT binary patch delta 166 zcmZp1XfcprU|?W$DortDU=RQ@Ie-{Mvv5r;6q~50$jGrVU^g=($7CLXsf?#4TM0L4 zNmf@I8kp!P7#bVa>L^qj8i6?GCMLDDoE+k+hPIvwxs_GbHMMoKCQlHOXPiAbTTpxR zRiQq{iKUMgvvY6=G6T&A0s(Fy;R>>9W8rt^$^0^&Ad4B8AdUdJi(zv-&m3j|>H;Ig delta 201 zcmZoMXmOBWU|?W$DortDU;r^WfEYvza8E20o2aMA$h|ROH}hr%jz7$c**Q2SHn1>q zPv&8nI{7#2*~$5=8m^oSNenp*i44UIB@FQlDGZqmMV>kN$w@i+Ng!i@hW`Z8djG)y z$YNl?qGR$0)*IYN;+q>-mojcn=2^|mCBY5U;tF!mW locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + #merge overlapping regions plus strand + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -= 1 + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merge overlapping regions minus strand + locations[j][0] = min(locations[j][0], locations[i][0]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -= 1 + elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -=1 + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand + locations[j][0] = min(locations[j][0], locations[i][0]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -=1 + i += 1 + j += 1 + + number_regions += len(locations) + blast_results[key] = locations + + return blast_results, number_regions + +def parse_blast(line, blast_results, cutoff): + # format blast line: + # format dictionary: {node_name: [(,,evalue, ,,)]} + line = line.replace("\n", "") + line_info = line.split("\t") + evalue = float(line_info[3]) + #cut off + if evalue > cutoff: + return blast_results, evalue + #add region to dictionary + else: + node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) + split = node_name.split("|") + # finding out on which strand tBLASTn found a hit + if sstart < send: + strand = "+" + else: + sstart = int(line_info[2]) + send = int(line_info[1]) + strand = "-" + #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off + if len(split) > 1: + node_name = split[1] + if node_name in blast_results: + list = blast_results[node_name] + list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand]) + blast_results[node_name] = list + else: + blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]] + + return blast_results, evalue + +def candidate_regions(intron_length, cutoff_evalue, tmp_path): + ###################### extracting candidate regions ######################## + # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 + blast_file = open(tmp_path + "/blast_results.out", "r") + evalue = 0 + blast_results = {} + #parsing blast output + while True: + line = blast_file.readline() + #end of file is reached + if not line: + break + #parsing blast output + blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) + + if blast_results == {}: + return 0,0 + else: + candidate_regions, number_regions = merge(blast_results, intron_length) + + return candidate_regions, number_regions + +def extract_seq(region_dic, path, tmp_path, mode): + + for key in region_dic: + #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") + cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" + starting_subprocess(cmd, mode) + +def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): + output = open(candidatesOutFile, "w") + + for key in regions: + locations = regions[key] + counter = 0 + for i in locations: + # some variables + counter += 1 + start = str(i[0] - length_extension) + end = str(i[1] + length_extension) + name = key + "_" + str(counter) + # augutus call + cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" + #print(cmd) + starting_subprocess(cmd, 'silent') + # transfer augustus output to as sequence + cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" + starting_subprocess(cmd, mode) + # parsing header and sequences + try: + sequence_file = open(tmp_path + name + ".aa", "r") + lines = sequence_file.readlines() + for line in lines: + if line[0] == ">": + id = line.replace(">", "") + header = ">" + group + "|" + ass_name + "|" + name + "_" + id + output.write(header) + else: + output.write(line) + sequence_file.close() + except FileNotFoundError: + print("No gene found in region with ID:" + name + " , continuing with next region") + output.close() + +def searching_for_db(assembly_path): + + db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] + check = True + for end in db_endings: + check = check and os.path.exists(assembly_path + end) + return check + +def get_distance_biopython(file, matrix): + aln = AlignIO.read(open(file), 'fasta') + calculator = DistanceCalculator(matrix) + dm = calculator.get_distance(aln) + return dm + +def readFasta(candidatesOutFile): + seq_records = SeqIO.parse(candidatesOutFile, "fasta") + return seq_records + +def getSeedInfo(path): + dic = {} + seq_records = readFasta(path) + for entry in seq_records: + species = entry.id.split("|")[1] + geneID = entry.id.split("|")[2] + + try: + dic[species].append(geneID) + except KeyError: + dic[species] = [geneID] + + del seq_records + return dic + +def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path): + ###########getting sequences and write all in one file to make msa ######### + name_file = candidate_name + ".co" + output_file = tmp_path + name_file + '.fasta' + aln_file = tmp_path + name_file + '.aln' + genome_dir_path = dataPath + '/genome_dir/%s/%s.fa'%(fdog_ref_species, fdog_ref_species) + #print(searchTool) + + out = open(output_file, "w") + inSeq = SeqIO.to_dict((SeqIO.parse(open(genome_dir_path), 'fasta'))) + out.write(">" + best_hit + "\n") + out.write(str(inSeq[best_hit].seq) + "\n") + out.write(">" + ref + "\n") + out.write(str(inSeq[ref].seq )+ "\n") + + candidates = readFasta(candidatesOutFile) + for record in candidates: + if candidate_name in record.id: + out.write(">" + candidate_name + "\n") + out.write(str(record.seq) + "\n") + break + + out.close() + + if msaTool == "muscle": + os.system("muscle -quiet -in " + output_file + " -out " + aln_file) + #print("muscle -quiet -in " + output_file + " -out " + aln_file) + elif msaTool == "mafft-linsi": + #print("mafft-linsi") + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + + distances = get_distance_biopython(aln_file, matrix) + + distance_hit_query = distances[best_hit, candidate_name] + distance_ref_hit = distances[best_hit, ref] + + if distance_ref_hit < distance_hit_query: + #accepted + return 1, distance_ref_hit, distance_hit_query + + else: + #rejected + return 0, distance_ref_hit, distance_hit_query + +def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode): + # the backward search uses the genes predicted from augustus and makes a blastp search + #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa + seedDic = getSeedInfo(fasta_path) + #print(fasta_path) + orthologs = [] + #print(seedDic) + blast_dir_path = dataPath + "/blast_dir/" + if strict != True: + seed = [fdog_ref_species] + try: + id_ref = seedDic[fdog_ref_species] + except KeyError: + print("The fDOG reference species isn't part of the core ortholog group, ... exciting") + return 0, seed + if searchTool == "blast": + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) + else: + print("diamonds are the girls best friends") + ##### diamond call + + alg_file = open(tmp_path + "blast_" + fdog_ref_species, "r") + lines = alg_file.readlines() + alg_file.close() + old_name = None + min = 10 + for line in lines: + id, gene, evalue = (line.replace("\n", "")).split("\t") + gene_name = gene.split("|")[2] + if gene_name != old_name: + print("candidate:%s"%(gene_name)) + print("blast-hit:%s"%(id)) + min = float(evalue) + if id in id_ref: + orthologs.append(gene) + print("\thitting\n") + else: + if checkCo == True: + for i in id_ref: + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) + if co_orthologs_result == 1: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + orthologs.append(gene) + elif co_orthologs_result == 0: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + else: + print("\tnothitting\n") + elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: + if id in id_ref: + orthologs.append(gene) + print("\thitting\n") + else: + if checkCo == True: + for i in id_ref: + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) + if co_orthologs_result == 1: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + orthologs.append(gene) + elif co_orthologs_result == 0: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + else: + print("\tnot hitting\n") + old_name = gene_name + + + if orthologs == []: + print("No hit in the backward search, ...exciting") + return 0, seed + + else: + if taxa != []: + seed = taxa + try: + i = seed.index(fdog_ref_species) + seed.insert(0,seed.pop(i)) + except ValueError: + seed.insert(0,fdog_ref_species) + #print(seed) + #print("with taxa list from user input") + + else: + seed = [] + for key in seedDic: + if key == fdog_ref_species: + seed.insert(0,key) + else: + seed.append(key) + + orthologs = set({}) + + for species in seed: + print("backward search in species " + species + "\n") + orthologs_new = set({}) + try: + id_ref = seedDic[species] + except KeyError: + print("The species " + species + " isn't part of the core ortholog group, ... exciting") + return 0, seed + + cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) + alg_file = open(tmp_path + "/blast_" + species, "r") + lines = alg_file.readlines() + alg_file.close() + old_name = None + min = 10 + for line in lines: + id, gene_name, evalue = (line.replace("\n", "")).split("\t") + if gene_name != old_name: + min = float(evalue) + if id in id_ref: + orthologs_new.add(gene_name) + + elif (gene_name == old_name) and float(evalue) == min: + if id in id_ref: + orthologs_new.add(gene_name) + + #print(species) + #print(orthologs_new) + if species == fdog_ref_species: + orthologs = orthologs_new + else: + orthologs = orthologs & orthologs_new + if orthologs == {}: + print("No ortholog was found with option --strict") + return 0, seed + + + + #print(orthologs) + orthologs = set(orthologs) + return list(orthologs), seed + +def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): + + output_file = open(output, "a+") + if refBool == False: + seq_records_core = readFasta(core_fasta) + seq_records_core = list(seq_records_core) + for species in species_list: + for entry_core in seq_records_core: + if species in entry_core.id: + output_file.write(">" + entry_core.id + "\n") + output_file.write(str(entry_core.seq) + "\n") + + if sequenceIds != 0: + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + output_file.close() + return 0 + +def createFasInput(orthologsOutFile, mappingFile): + with open(orthologsOutFile, "r") as f: + fas_seed_id = (f.readline())[1:-1] + #fas_seed_id = fas_seed_id.split("|")[0] + + mappingFile = open(mappingFile, "a+") + + seq_records = readFasta(orthologsOutFile) + for seq in seq_records: + ncbi_id = (seq.id.split("@"))[1] + mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n") + + + return fas_seed_id + +def cleanup(tmp, tmp_path): + if tmp == False: + os.system('rm -r ' + tmp_path) + +def checkOptions(): + pass + #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! + +def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): + if len(candidate_names) == 1: + return candidate_names + + candidates = readFasta(candidatesFile) + ref = readFasta(fasta) + + out = tmp_path + '/checkCoorthologs.fa' + f = open(out,"w") + + aln_file = tmp_path + '/checkCoorthologs.aln' + + for record in ref: + if fdog_ref_species in record.id: + ref_id = record.id + f.write(">" + record.id + "\n") + f.write(str(record.seq) + "\n") + break + + for record in candidates: + for name in candidate_names: + if name in record.id: + f.write(">" + name + "\n") + f.write(str(record.seq) + "\n") + f.close() + + if msaTool == "muscle": + os.system("muscle -quiet -in " + out + " -out " + aln_file) + elif msaTool == "mafft-linsi": + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file) + + distances = get_distance_biopython(aln_file, matrix) + + min_dist = 10 + min_name = None + + for name in candidate_names: + distance = distances[ref_id , name] + if distance <= min_dist: + min_dist = distance + min_name = name + + checked = [min_name] + + for name in candidate_names: + if name == min_name: + pass + elif distances[min_name , name] <= distances[min_name , ref_id]: + checked.append(name) + + return checked + +def clean_fas(path, file_type): + file = open(path, "r") + lines = file.readlines() + file.close() + file = open(path,"w") + + for line in lines: + if file_type == 'domains': + long_id, remain = line.split("#") + id = long_id.split("|")[0] + new_line = id + "#" + remain + else: + long_id, remain = line.split("\t", 1) + id = long_id.split("|")[0] + new_line = id + "\t" + remain + + file.write(new_line) + +class Logger(object): + def __init__(self, file): + self.file = file + self.terminal = sys.stdout + self.log = self.file + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + + def flush(self): + pass + + +def main(): + + #################### handle user input ######################################## + + version = '0.0.1' + + parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') + parser.add_argument('--version', action='version', version=str(version)) + + required = parser.add_argument_group('Required arguments') + required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', + action='store', default='', required=True) + required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) + required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True) + + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) + optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) + optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') + optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) + optional.add_argument('--out', help='Output directory', action='store', default='') + optional.add_argument('--dataPath', help='data directory', action='store', default='') + optional.add_argument('--coregroupPath', help='core_ortholog directory', action='store', default='') + optional.add_argument('--searchTool', help='Choose between blast and diamond as alignemnt search tool(default:blast)', action='store', choices=['blast', 'diamond'], default='blast') + optional.add_argument('--evalBlast', help='E-value cut-off for the Blast search. (default: 0.00001)', action='store', default=0.00001, type=float) + optional.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) + optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') + optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) + optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='') + optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) + optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') + optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') + optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) + optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + + + args = parser.parse_args() + + # required + group = args.gene + augustus_ref_species = args.augustusRefSpec + fdog_ref_species = args.refSpec + #paths user input + assemblyDir = args.assemblyPath + dataPath = args.dataPath + core_path = args.coregroupPath + out = args.out + pathFile = args.pathFile + #I/O + tmp = args.tmp + strict = args.strict + checkCoorthologs = args.checkCoorthologsRef + filter = args.filter + if filter == True or filter == 'yes': + filter = 'yes' + else: + filter = 'no' + #others + average_intron_length = args.avIntron + length_extension = args.lengthExtension + searchTool = args.searchTool + evalue = args.evalBlast + msaTool = args.msaTool + matrix = args.scoringmatrix + taxa = args.coreTaxa + if taxa == '': + taxa =[] + else: + taxa = taxa.split(",") + fasoff = args.fasoff + searchTaxon = args.searchTaxon + silent = args.silent + debug = args.debug + + if debug == True and silent == True: + print("It's not possible to use booth modes, please restart and use --debug or --silent") + return 1 + else: + if debug == True: + mode = 'debug' + elif silent == True: + mode = 'silent' + else: + mode = 'normal' + + #checking paths + if dataPath == '': + fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','') + configFile = fdogPath + '/bin/pathconfig.txt' + if not os.path.exists(configFile): + sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog) or give a dataPath') + if pathFile == '': + with open(configFile) as f: + dataPath = f.readline().strip() + else: + cfg = load_config(pathFile) + try: + dataPath = cfg['dataPath'] + except: + dataPath = 'config' + if core_path == '': + core_path = out + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' + + if assemblyDir == '': + assemblyDir = dataPath + '/assembly_dir/' + if out == '': + #print('test out \n') + out = os.getcwd() + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + else: + if out[-1] != "/": + out = out + "/" + + + try: + f = open(out + "/fdog.log", "a+") + except FileNotFoundError: + f = open(out + "/fdog.log", "w") + + ################## How to handle std output and std error ################## + + if mode == 'silent': + sys.stderr = f + sys.stdout = f + else: + sys.stdout = Logger(f) + + # user input has to be checked here before fDOGassembly continues + assembly_names = os.listdir(assemblyDir) + + ########################## some variables ################################## + + refBool = False # checks if sequences of reference species were already part of the extended.fa file + + ########### paths ########### + + msa_path = core_path + "/" + group +"/"+ group + ".aln" + hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + fasta_path = core_path + "/" + group +"/"+ group + ".fa" + consensus_path = out + "/tmp/" + group + ".con" + profile_path = out + "/tmp/" + group + ".prfl" + + ###################### create tmp folder ################################### + + cmd = 'mkdir ' + out + '/tmp' + starting_subprocess(cmd, 'silent') + + ######################## consensus sequence ################################ + + #make a majority-rule consensus sequence with the tool hmmemit from hmmer + print("Building a consensus sequence for gene " + group + " \n") + cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path + starting_subprocess(cmd, mode) + print("consensus sequence is finished\n") + + ######################## block profile ##################################### + + print("Building a block profile for gene " + group + " \n") + cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + + if int(os.path.getsize(profile_path)) > 0: + print("block profile is finished \n") + else: + print("Building block profiles failed. Using prepareAlign to convert alignment\n") + new_path = core_path + group +"/"+ group + "_new.aln" + #print(cmd) + cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path + starting_subprocess(cmd, mode) + cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path + #print(cmd) + starting_subprocess(cmd, 'silent') + print("block profile is finished \n") + + searchBool = False + + #################### fDOG assembly computation for all species ############# + for asName in assembly_names: + if searchBool == True: + break + if searchTaxon != '' and searchBool == False: + asName = searchTaxon + searchBool = True + + ################### path definitions ################################### + + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "/tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + if searchTaxon != '': + orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" + fasOutFile = out + "/" + group + "_" + asName + mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" + else: + orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + mappingFile = out + "/tmp/" + group + ".mapping.txt" + + print("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + + ######################## tBLASTn ########################################### + #checks if data base exists already + db_check = searching_for_db(db_path) + if db_check == 0: + print("creating a blast data base \n") + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + print("database is finished \n") + else: + print('blast data base exists already, continuing...') + + #makes a tBLASTn search against the new database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + print("tBLASTn search against data base") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + starting_subprocess(cmd, mode) + print("tBLASTn search is finished") + + ################### search for candidate regions and extract seq ########### + # parse blast and filter for candiate regions + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + + if regions == 0: + #no candidat region are available, no ortholog can be found + print("No candidate region found") + if refBool == True: + continue + else: + taxa = [fdog_ref_species] + reciprocal_sequences = 0 + else: + print(str(number_regions) + " candiate regions were found. Extracting sequences...") + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search ################################### + + print("starting augustus ppx \n") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("augustus is finished \n") + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + print("No genes found at candidate regions\n") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 + taxa = [fdog_ref_species] + else: + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + + + ################## checking accepted genes for co-orthologs ################ + if reciprocal_sequences == 0: + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + + ################ add sequences to extended.fa in the output folder########## + + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) + refBool = True + + ############### make Annotation with FAS ################################### + # if we want to search in only one Taxon + if searchTaxon != '' and fasoff == False: + print("Calculating FAS scores") + fas_seed_id = createFasInput(orthologsOutFile, mappingFile) + # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option + cmd = 'mkdir ' + tmp_path + 'anno_dir' + starting_subprocess(cmd, 'silent') + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + starting_subprocess(cmd, 'silent') + clean_fas(fasOutFile + "_forward.domains", 'domains') + clean_fas(fasOutFile + "_reverse.domains", 'domains') + clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') + + + #if we searched in more than one Taxon and no ortholog was found + + if refBool == False and searchTaxon == '': + print("No orthologs found. Exciting ...") + cleanup(tmp, tmp_path) + return 1 + #if we searched in more than one taxon + if fasoff == False and searchTaxon == '': + print("Calculating FAS scores") + tmp_path = out + '/tmp/' + fas_seed_id = createFasInput(orthologsOutFile, mappingFile) + # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + starting_subprocess(cmd, 'silent') + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') + ################# remove tmp folder ######################################## + if searchTaxon != '': + cleanup(tmp, tmp_path) + else: + cleanup(tmp, out + "/tmp/") + + f.close() + +if __name__ == '__main__': + main() diff --git a/fdog/fdog_goes_assembly/.DS_Store b/fdog/fdog_goes_assembly/.DS_Store deleted file mode 100644 index e0e9ff1be0aa35d6ef237330e7d7dd1ba746d1ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}(1u5S~p^*rck+0SS(K;TEB|ttxRqG9d_Y0?7ytv;wSiB?DPC6qc=cynH z(?=VMz9V&80{tcXOw7~QTk1`P}iS~paoEd z9Ot|DN7ViMQPD8#xxOBvUJ_?{v-v1W<;sVJMJa2t_Nn$GIM>4<5A#9R3i~HiI|%dh z$k~TlCogsw9mU<@X?=NHr+FBs-M&eVJ6#O+piEGRNGw@$AAS&&k-NH|Dd+W@{(O#=i&rnGyuCVx$f`-|O gF_yOCRa7JBS7adi4hxIuLE(ddp@9o#;GZ(^412?A3IG5A diff --git a/fdog/fdog_goes_assembly/fDOGassembly.py b/fdog/fdog_goes_assembly/fDOGassembly.py deleted file mode 100644 index ad4c362..0000000 --- a/fdog/fdog_goes_assembly/fDOGassembly.py +++ /dev/null @@ -1,209 +0,0 @@ -############################ imports ########################################### -import os -########################### functions ########################################## - - -def merge_regions(blast_results, cut_off): - number_regions = 0 - for key in blast_results: - locations = blast_results[key] - size_list = len(locations) - i = 0 - j = 1 - old_size = 0 - while size_list != old_size and i < size_list: - old_size = size_list - start = locations[i][0] - end = locations[i][1] - - #print(locations) - while j < size_list: - - # breakup point? or we have to skip this j - if (i == j) and (j + 1 < size_list): - j+=1 - elif (i == j): - break - - if (locations[i][0] < locations[j][0]) and (locations[i][1] > locations[j][0]): - # start is between start and end -> merge - locations[i][1] = max(locations[j][1], locations[i][1]) - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - elif (locations[i][0] < locations[j][1]) and (locations[i][1] > locations[j][1]): - #end is between start and end -> merge - locations[i][0] = min(locations[j][0], locations[i][0]) - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - elif (locations[i][0] > locations[j][1]) and (locations[i][0] - locations[j][1] <= cut_off): - # end is not more than cut-off distanced - locations[i][0] = locations[j][0] - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - elif (locations[i][1] < locations[j][0] and locations[j][0] - locations[i][1] <= cut_off): - # start is not more than cut-off distanced - locations[i][0] = locations[j][0] - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - j += 1 - size_list = len(locations) - - i += 1 - j = 0 - number_regions += size_list - - return blast_results, number_regions - - -def parse_blast(line, blast_results): - # format blast line: - #fomrat dictionary: {node_name: [(,)]} - #print(line) - line = line.replace("\n", "") - line_info = line.split("\t") - #print(line_info) - evalue = float(line_info[3]) - - #cut off - if evalue > 0.0001: - return blast_results, evalue - #add region to dictionary - else: - node_name, start, end = line_info[0], line_info[1], line_info[2] - if node_name in blast_results: - list = blast_results[node_name] - list.append([int(start),int(end), evalue]) - blast_results[node_name] = list - else: - blast_results[node_name] = [[int(start),int(end), evalue]] - - return blast_results, evalue - - -def candidate_regions(cut_off): - ###################### extracting candidate regions ######################## - # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 - blast_file = open("tmp/blast_results.out", "r") - - evalue = 0 - blast_results = {} - #parsing blast output - while True: - line = blast_file.readline() - #end of file is reached - if not line: - break - #parsing blast output - blast_results, evalue = parse_blast(line, blast_results) - #evalue cut-off - if not evalue <= 0.00001: - break - if blast_results == {}: - return 1 - else: - candidate_regions, number_regions = merge_regions(blast_results, cut_off) - #print(candidate_regions, number_regions) - return candidate_regions, number_regions - - -def extract_seq(region_dic, path): - #print(region_dic) - for key in region_dic: - os.system("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") - - -def main(): - - ########################### handle user input ############################## - - #user input core_ortholog group - #have to add an input option - - #core-ortholog group name - group = "778452" - - #species name assemblie (folder name in assemby folder) - species_name = "L.pustulata" - - #assembly species_name - assembly_name = "contigs.fa" - - augustus_ref_species = "saccharomyces_cerevisiae_S288C" - - cut_off_merging_candidates = 500 - - - ########################## paths ########################################### - - #open core_ortholog group - msa_path = "../data/core_orthologs/" + group +"/"+ group + ".aln" - hmm_path = "../data/core_orthologs/" + group +"/hmm_dir/"+ group + ".hmm" - consensus_path = "tmp/" + group + ".con" - profile_path = "tmp/" + group + ".prfl" - path_assembly = "../data/assembly_dir/" + species_name + "/" + assembly_name - - os.system('mkdir tmp') - - - ######################## consensus sequence ################################ - - #make a majority-rule consensus seqeunce with the tool hmmemit from hmmer - print("Building a consensus sequence \n") - os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path) - print("consensus seqeunce is finished\n") - - ######################## block profile ##################################### - print("Building a block profile \n") - - os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path) - print("block profile is finished \n") - ######################## tBLASTn ########################################### - - #database anlegen - print("creating a blast database \n") - os.system('makeblastdb -in ' + path_assembly + ' -dbtype nucl -parse_seqids -out ' + path_assembly) - print("database is finished \n") - - #make a tBLASTn search against the new database - - os.system('tblastn -db ' + path_assembly + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue bitscore" -out tmp/blast_results.out') - - ################### search for candidate regions and extract seq ########### - - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(cut_off_merging_candidates) - - if regions == 1: - #no candidat region are available, no ortholog can be found - print("No candidate region found") - os.system('rm -r tmp/') - return 1 - - else: - print(str(number_regions) + " candiate regions were found. Extracting sequences.") - extract_seq(regions, path_assembly) - - ############### make Augustus PPX search #################################### - for key in regions: - locations = regions[key] - counter = 0 - for i in locations: - counter += 1 - start = str(i[0]) - end = str(i[1]) - if start < end: - #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff") - os.system("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + "_" + str(counter) + ".gff") - else: - os.system("augustus --proteinprofile=" + profile_path + " --predictionStart=" + end + " --predictionEnd=" + start + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + "_" + str(counter) + ".gff") - - ################# remove tmp folder ######################################## - - #have to be added after program ist finished, maybe use parametere so that the user can turn it off - -if __name__ == '__main__': - main() diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py new file mode 100644 index 0000000..1606b1d --- /dev/null +++ b/fdog/mergeAssemblyOutput.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2020 Vinh Tran +# +# This script is used to merge all output files (.extended.fa, .phyloprofile, +# _forward.domains, _reverse.domains) in a given directory into one file each. +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@stud.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from os import listdir as ldir +import argparse +from pathlib import Path + +def main(): + version = '0.0.1' + parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.') + parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', + action='store', default='', required=True) + parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) + parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False) + args = parser.parse_args() + + directory = args.input + out = args.output + cleanup = args.cleanup + if not os.path.exists(os.path.abspath(directory)): + sys.exit('%s not found' % directory) + else: + directory = os.path.abspath(directory) + + phyloprofile = None + set_phylo = set() + domains_0 = None + set_domains_f = set() + domains_1 = None + set_domains_r = set() + ex_fasta = None + set_fasta = set() + header_bool = False + for infile in ldir(directory): + if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': + if not phyloprofile: + phyloprofile = open(out + '.phyloprofile', 'w') + phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo: + phyloprofile.write(line) + if len(lines) > 1: + set_phylo = set(lines) + if cleanup == True: + os.remove(directory + '/' + infile) + elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains': + if not domains_0: + domains_0 = open(out + '_forward.domains', 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + if line not in set_domains_f: + domains_0.write(line) + if len(lines) > 1: + set_domains_f = set(lines) + if cleanup == True: + os.remove(directory + '/' + infile) + elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains': + if not domains_1: + domains_1 = open(out + '_reverse.domains', 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + if line not in set_domains_r: + domains_1.write(line) + if len(lines) > 1: + set_domains_r = set(lines) + if cleanup == True: + os.remove(directory + '/' + infile) + elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa': + if not ex_fasta: + ex_fasta = open(out + '.extended.fa', 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + header = set() + #print(set_fasta) + for line in lines: + if line[0] == ">": + header.add(line) + if line not in set_fasta: + ex_fasta.write(line) + header_bool = True + else: + header_bool = False + else: + if header_bool == True: + ex_fasta.write(line) + set_fasta = header + if cleanup == True: + os.remove(directory + '/' +infile) + elif infile.endswith('.tsv'): + os.remove(directory + '/' + infile) + + if phyloprofile: + phyloprofile.close() + if domains_0: + domains_0.close() + if domains_1: + domains_1.close() + if ex_fasta: + ex_fasta.close() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 65335d5..a696495 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -28,6 +28,7 @@ from tqdm import tqdm import fdog.runSingle as fdogFn import shutil +import yaml def getSortedFiles(directory): list = os.listdir(directory) @@ -46,8 +47,8 @@ def prepare(args, step): outpath, hmmpath, blastpath, searchpath, weightpath, coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, - strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent) = args + strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, + cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args mute = False if step == 'core': @@ -67,9 +68,10 @@ def prepare(args, step): pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] - orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] + orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, debug, True] - return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) + assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) def getSeedName(seedFile): seqName = seedFile.split('.')[0] @@ -104,17 +106,20 @@ def compileCore(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) - pool = mp.Pool(cpu) - coreOut = [] - for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)): - coreOut.append(_) - pool.close() - pool.join() + + if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') + coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute]) + if len(coreCompilationJobs) > 0: + pool = mp.Pool(cpu) + coreOut = [] + for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)): + coreOut.append(_) + pool.close() + pool.join() + # read logs file to get runtime for individual seeds + getIndividualRuntime('core', outpath, seeds) end = time.time() - # read logs file to get runtime for individual seeds - getIndividualRuntime('core', outpath, seeds) multiCoreTime = '{:5.3f}'.format(end-start) print('==> Core compiling finished in %s sec' % multiCoreTime) #'{:5.3f}s'.format(end-start)) return(multiCoreTime) @@ -126,7 +131,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') if mute == True: print(seed) else: @@ -139,7 +144,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): print('==> Ortholog search finished in %s sec' % multiOrthoTime) return(multiOrthoTime) -def joinOutputs(outpath, jobName, seeds, keep): +def joinOutputs(outpath, jobName, seeds, keep, silent): print('Joining single outputs...') finalFa = '%s/%s.extended.fa' % (outpath, jobName) Path(outpath+'/singleOutput').mkdir(parents=True, exist_ok=True) @@ -147,14 +152,20 @@ def joinOutputs(outpath, jobName, seeds, keep): for seed in seeds: seqName = getSeedName(seed) resultFile = '%s/%s/%s.extended.fa' % (outpath, seqName, seqName) + if silent == False: + print(resultFile) if os.path.exists(resultFile): with open(resultFile,'rb') as fd: shutil.copyfileobj(fd, wfd) shutil.move(outpath + '/' + seqName, outpath + '/singleOutput') else: Path(outpath+'/missingOutput').mkdir(parents=True, exist_ok=True) - shutil.move(outpath + '/' + seqName, outpath + '/missingOutput') + if not os.path.exists(outpath + '/missingOutput/' + seqName): + shutil.move(outpath + '/' + seqName, outpath + '/missingOutput') + if os.path.exists(outpath + '/' + seqName + '.fa'): os.remove(outpath + '/' + seqName + '.fa') + if os.path.exists(os.getcwd() + '/' + seqName + '.fa'): + os.remove(os.getcwd() + '/' + seqName + '.fa') if keep == True: try: print('Compressing single outputs...') @@ -180,7 +191,7 @@ def calcFAS (outpath, extendedFa, weightpath, cpu): sys.exit('Problem running\n%s' % (fasCmd)) def main(): - version = '0.0.13' + version = '0.0.33' parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -197,10 +208,12 @@ def main(): optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='') optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') + optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) addtionalIO.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + addtionalIO.add_argument('--forceComplete', help='Overwrite existing core orthologs and all output files', action='store_true', default=False) addtionalIO.add_argument('--cleanup', help='Temporary output will be deleted. Default: True', action='store_true', default=True) addtionalIO.add_argument('--keep', help='Keep output of individual seed sequence. Default: False', action='store_true', default=False) addtionalIO.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') @@ -229,8 +242,15 @@ def main(): action='store', default=3, type=int) core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05', action='store', default=0.05, type=float) + core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', + action='store_true', default=False) + core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', + action='store_true', default=True) + core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', + action='store_true', default=False) ortho_options = parser.add_argument_group('Search strategy options') + ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='') ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', @@ -239,9 +259,7 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--rep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs', action='store_true', default=False) - ortho_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', - action='store_true', default=False) - ortho_options.add_argument('--lowComplexityFilterOff', help='Switch on or off the low complexity filter for the blast search. Default: False', + ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. Default: False', action='store_true', default=False) ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00005', action='store', default=0.00005, type=float) @@ -257,13 +275,6 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', action='store', default=10, type=int) - ortho_options.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', - choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - ortho_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', - action='store_true', default=True) - ortho_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', - action='store_true', default=False) - ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='') fas_options = parser.add_argument_group('FAS options') fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) @@ -274,11 +285,21 @@ def main(): fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float) optional = parser.add_argument_group('Other options') + optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', + choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + assembly_options = parser.add_argument_group('Assembly options') + assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) + assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') + assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') + assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) + assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) + assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') + assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -297,10 +318,12 @@ def main(): blastpath = args.blastpath searchpath = args.searchpath weightpath = args.weightpath + pathFile = args.pathFile # other I/O arguments append = args.append force = args.force + forceComplete = args.forceComplete cleanup = args.cleanup keep = args.keep group = args.group @@ -323,7 +346,7 @@ def main(): rbh = args.rbh rep = args.rep ignoreDistance = args.ignoreDistance - lowComplexityFilterOff = args.lowComplexityFilterOff + lowComplexityFilter = args.lowComplexityFilter evalBlast = args.evalBlast evalHmmer = args.evalHmmer evalRelaxfac = args.evalRelaxfac @@ -351,22 +374,89 @@ def main(): silent = False else: silent = True + + #fdog_goes_assembly arguments + assembly = args.assembly + assemblyFile = args.assemblyFile + augustusRefSpec = args.augustusRefSpec + avIntron = args.avIntron + lengthExtension = args.lengthExtension + searchTool = args.searchTool + matrix = args.scoringmatrix + + ### check fas + if not fasoff: + try: + fasVersion = subprocess.run(['calcFAS --version'], shell = True, capture_output = True, check = True) + except: + sys.exit('Problem with calcFAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!') + + ### delete output folder and files if needed + if forceComplete: + if os.path.exists(outpath): + print("Removing existing output directory %s" % outpath) + shutil.rmtree(outpath) + Path(outpath).mkdir(parents=True, exist_ok=True) + if force: + if os.path.exists(outpath): + print("Removing existing files %s in %s*" % (jobName, outpath)) + outfiles = os.listdir(outpath) + for item in outfiles: + if item.startswith(jobName): + os.remove(os.path.join(outpath, item)) + if item.startswith("runtime"): + os.remove(os.path.join(outpath, item)) + if os.path.exists(outpath + '/missing.txt'): + os.remove(outpath + '/missing.txt') ### get fdog and data path + dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runMulti.py','') pathconfigFile = fdogPath + '/bin/pathconfig.txt' if not os.path.exists(pathconfigFile): sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - dataPath = f.readline().strip() + if pathFile == '': + with open(pathconfigFile) as f: + dataPath = f.readline().strip() + else: + cfg = fdogFn.load_config(pathFile) + try: + dataPath = cfg['dataPath'] + except: + dataPath = 'config' + if hmmpath == '': - hmmpath = dataPath + '/core_orthologs' + hmmpath = outpath + '/core_orthologs' + # hmmpath = dataPath + '/core_orthologs' + # if dataPath == 'config': + # try: + # hmmpath = cfg['hmmpath'] + # except: + # sys.exit('hmmpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) + else: + hmmpath = os.path.abspath(hmmpath) if blastpath == '': blastpath = dataPath + '/blast_dir' + if dataPath == 'config': + try: + blastpath = cfg['blastpath'] + except: + sys.exit('blastpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) if searchpath == '': searchpath = dataPath + '/genome_dir' + if dataPath == 'config': + try: + searchpath = cfg['searchpath'] + except: + sys.exit('searchpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) if weightpath == '': weightpath = dataPath + '/weight_dir' + if dataPath == 'config': + try: + weightpath = cfg['weightpath'] + except: + sys.exit('weightpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) + ### join options options = [fdogPath, refspec, minDist, maxDist, coreOrth, @@ -374,10 +464,11 @@ def main(): outpath, hmmpath, blastpath, searchpath, weightpath, coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, - strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent] + strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, + cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] ### START + Path(outpath).mkdir(parents=True, exist_ok=True) multiLog = open(outpath + '/' + jobName + '_log.txt', "w") fdogStart = time.time() seeds = getSortedFiles(inFol) @@ -388,30 +479,40 @@ def main(): if reuseCore == False: multiCoreTime = compileCore(options, seeds, inFol, cpu, outpath) multiLog.write('==> Core compilation finished in %s sec\n' % multiCoreTime) + else: + if not os.path.exists(hmmpath): + sys.exit('--reuseCore was set, but no core orthologs found in %s! You could use --hmmpath to manually specify the core ortholog directory.' % outpath) ### do ortholog search if coreOnly == False: - ### create list of search taxa - searchTaxa = '' - searchGroup = 'all' - if not group == '': - print('Creating list for search taxa...') - searchTaxa = '%s/searchTaxa.txt' % (outpath) - searchGroup = group - cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa) - try: - subprocess.call([cmd], shell = True) - except: - sys.exit('Problem running\n%s' % (cmd)) - ### run ortholog search - multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath) - multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime) - ### join output - finalFa = joinOutputs(outpath, jobName, seeds, keep) + if not os.path.exists('%s/%s.extended.fa' % (outpath, jobName)): + ### create list of search taxa + searchTaxa = '' + searchGroup = 'all' + if not group == '': + print('Creating list for search taxa...') + searchTaxa = '%s/searchTaxa.txt' % (outpath) + searchGroup = group + cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa) + try: + subprocess.call([cmd], shell = True) + except: + sys.exit('Problem running\n%s' % (cmd)) + ### run ortholog search + multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath) + multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime) + ### join output + finalFa = joinOutputs(outpath, jobName, seeds, keep, silent) + else: + print("%s.extended.fa found in %s! If you want to re-run the ortholog search, please use --force option." % (jobName, outpath)) ### calculate FAS scores if fasoff == False: - fasTime = calcFAS(outpath, finalFa, weightpath, cpu) - multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime) + if not os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)): + if os.path.exists(finalFa) and os.path.getsize(finalFa) > 0: + fasTime = calcFAS(outpath, finalFa, weightpath, cpu) + multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime) + else: + print("Final fasta file %s not exists or empty!" % finalFa) fdogEnd = time.time() print('==> fdogs.run finished in ' + '{:5.3f}s'.format(fdogEnd-fdogStart)) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index f235ff8..a0ded09 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -20,13 +20,24 @@ import argparse import subprocess from pathlib import Path +import yaml def checkFileExist(file): if not os.path.exists(os.path.abspath(file)): sys.exit('%s not found' % file) +def load_config(config_file): + with open(config_file, 'r') as stream: + try: + return yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + def checkInput(args): (fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath) = args + # create output directory + Path(outpath).mkdir(parents=True, exist_ok=True) + Path(hmmpath).mkdir(parents=True, exist_ok=True) # check path existing for path in [hmmpath, blastpath, searchpath, weightpath]: checkFileExist(path) @@ -38,8 +49,6 @@ def checkInput(args): seqFile = fdogPath + '/data/' + seqFile else: seqFile = os.path.abspath(seqFile) - # create output directory - Path(outpath).mkdir(parents=True, exist_ok=True) # check refspec if not os.path.exists(os.path.abspath(blastpath+'/'+refspec)): exit('Reference taxon %s not found in %s' % (refspec, blastpath)) @@ -56,13 +65,13 @@ def getfdogInfo(fdogPath, infoType): exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args # basic command (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) + (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs + cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) # add other I/O options (append, force, noCleanup, group, blast, db) = ioArgs if append == True: @@ -98,7 +107,7 @@ def runSingle(args): if not distDeviation == 0.05: cmd = cmd + ' -distDeviation=%s' % distDeviation # add ortholo search options - (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs + (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs if strict == True: cmd = cmd + ' -strict' if checkCoorthologsRef == True: @@ -109,8 +118,8 @@ def runSingle(args): cmd = cmd + ' -rep' if ignoreDistance == True: cmd = cmd + ' -ignoreDistance' - if lowComplexityFilterOff == True: - cmd = cmd + ' -filter=F' + if lowComplexityFilter == True: + cmd = cmd + ' -filter=T' if not evalBlast == 0.00005: cmd = cmd + ' -evalBlast=%s' % evalBlast if not evalHmmer == 0.00005: @@ -152,7 +161,28 @@ def runSingle(args): cmd = cmd + ' -debug' if silent == True: cmd = cmd + ' -silent' - # print(cmd) + # add assembly options + (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs + if assembly == True: + cmd = cmd + ' -assembly' + cmd = cmd + ' -reuseCore' + if not augustusRefSpec == '': + cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec + else: + sys.exit('An augutus reference species is requiered by using the option --assembly') + if not avIntron == '': + cmd = cmd + ' -avIntron=%s' % avIntron + if not lengthExtension == '': + cmd = cmd + ' -lengthExtension=%s' % lengthExtension + if not assemblyFile == '': + cmd = cmd + ' -assemblyFile=%s' % assemblyFile + if not searchTool == '': + cmd = cmd + ' -searchTool=%s' % searchTool + if not matrix == '': + cmd = cmd + ' -scoringmatrix=%s' % matrix + if not dataPath == '': + cmd = cmd + ' -dataPath=%s' % dataPath + #print(cmd) if mute == True: cmd = cmd + ' > /dev/null 2>&1' try: @@ -161,7 +191,7 @@ def runSingle(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.13' + version = '0.0.33' parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -178,6 +208,9 @@ def main(): optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='') optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') + optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') + optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='') + addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) @@ -209,8 +242,15 @@ def main(): action='store', default=3, type=int) core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05', action='store', default=0.05, type=float) + core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', + action='store_true', default=False) + core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', + action='store_true', default=True) + core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', + action='store_true', default=False) ortho_options = parser.add_argument_group('Ortholog search strategy options') + ortho_options.add_argument('--searchTaxa', help='Specify file contains list of search taxa', action='store', default='') ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', @@ -219,9 +259,7 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--rep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs', action='store_true', default=False) - ortho_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', - action='store_true', default=False) - ortho_options.add_argument('--lowComplexityFilterOff', help='Switch on or off the low complexity filter for the blast search. Default: False', + ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. Default: False', action='store_true', default=False) ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00005', action='store', default=0.00005, type=float) @@ -237,13 +275,6 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', action='store', default=10, type=int) - ortho_options.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', - choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - ortho_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', - action='store_true', default=True) - ortho_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', - action='store_true', default=False) - ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='') fas_options = parser.add_argument_group('FAS options') fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) @@ -254,11 +285,21 @@ def main(): fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float) optional = parser.add_argument_group('Other options') + optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', + choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + assembly_options = parser.add_argument_group('Assembly options') + assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) + assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') + assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') + assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) + assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) + assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') + assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -277,6 +318,8 @@ def main(): blastpath = args.blastpath searchpath = args.searchpath weightpath = args.weightpath + pathFile = args.pathFile + assemblypath = args.assemblypath # other I/O arguments append = args.append @@ -302,7 +345,7 @@ def main(): rbh = args.rbh rep = args.rep ignoreDistance = args.ignoreDistance - lowComplexityFilterOff = args.lowComplexityFilterOff + lowComplexityFilter = args.lowComplexityFilter evalBlast = args.evalBlast evalHmmer = args.evalHmmer evalRelaxfac = args.evalRelaxfac @@ -331,36 +374,86 @@ def main(): else: silent = True + #fdog_goes_assembly arguments + assembly = args.assembly + assemblyFile = args.assemblyFile + augustusRefSpec = args.augustusRefSpec + avIntron = args.avIntron + lengthExtension = args.lengthExtension + searchTool = args.searchTool + matrix = args.scoringmatrix + ### get fdog and data path + dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') pathconfigFile = fdogPath + '/bin/pathconfig.txt' if not os.path.exists(pathconfigFile): sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - dataPath = f.readline().strip() + if pathFile == '': + with open(pathconfigFile) as f: + dataPath = f.readline().strip() + else: + cfg = load_config(pathFile) + try: + dataPath = cfg['dataPath'] + except: + dataPath = 'config' + if hmmpath == '': - hmmpath = dataPath + '/core_orthologs' + hmmpath = outpath + '/core_orthologs' + # hmmpath = dataPath + '/core_orthologs' + # if dataPath == 'config': + # try: + # hmmpath = cfg['hmmpath'] + # except: + # sys.exit('hmmpath not found in %s' % pathFile) + if blastpath == '': blastpath = dataPath + '/blast_dir' + if dataPath == 'config': + try: + blastpath = cfg['blastpath'] + except: + sys.exit('blastpath not found in %s' % pathFile) if searchpath == '': searchpath = dataPath + '/genome_dir' + if dataPath == 'config': + try: + searchpath = cfg['searchpath'] + except: + sys.exit('searchpath not found in %s' % pathFile) if weightpath == '': weightpath = dataPath + '/weight_dir' + if dataPath == 'config': + try: + weightpath = cfg['weightpath'] + except: + sys.exit('weightpath not found in %s' % pathFile) + + if assemblypath == '': + assemblypath = dataPath + '/assembly_dir' + if dataPath == 'config': + try: + assemblypath = cfg['assemblypath'] + except: + sys.exit('assemblypath not found in %s' % pathFile) + if assembly == True: + searchpath = assemblypath ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) - # group arguments basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] + pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] - orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] + orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, debug, silent] + assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) + runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False]) if __name__ == '__main__': main() diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 2894e6c..3c561e7 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -199,7 +199,8 @@ fi data_fdog_file="data_HaMStR-2019c.tar.gz" checkSumData="1748371655 621731824 $data_fdog_file" cd $outDir -if [ ! -d "$outDir/core_orthologs" ]; then mkdir "$outDir/core_orthologs"; fi +if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi +if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi if ! [ "$(ls -A $outDir/genome_dir)" ]; then echo "-------------------------------------" diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index b8c90e6..ddc4e23 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -116,6 +116,7 @@ dependencies=( mafft # for linsi muscle fasta36 + augustus #for fdog.assembly ) for i in "${dependencies[@]}"; do @@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do fi elif [ "$tool" = "fasta36" ]; then conda install -y -c bioconda fasta3 + elif [ "$tool" = "augustus" ]; then + conda install -y -c bioconda augustus else conda install -y -c bioconda $i fi @@ -258,7 +261,8 @@ echo "done!" data_fdog_file="data_HaMStR-2019c.tar.gz" checkSumData="1748371655 621731824 $data_fdog_file" cd $outDir -if [ ! -d "$outDir/core_orthologs" ]; then mkdir "$outDir/core_orthologs"; fi +if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi +if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi if ! [ "$(ls -A $outDir/genome_dir)" ]; then echo "-------------------------------------" diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 18c5368..b6a67d6 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -20,6 +20,7 @@ import os import argparse import subprocess +from ete3 import NCBITaxa from pathlib import Path def checkOptConflict(lib, conda): @@ -28,7 +29,7 @@ def checkOptConflict(lib, conda): sys.exit('*** ERROR: --lib and --conda cannot be used at the same time!') def main(): - version = '0.0.2' + version = '0.0.3' parser = argparse.ArgumentParser(description='You are running fdog.setup version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -60,6 +61,9 @@ def main(): dataPath = f.readline().strip() print(dataPath) sys.exit() + ### get ncbi taxonomy database for ete3 + print('Creating local NCBI taxonomy database...') + ncbi = NCBITaxa() ### run setup if conda: setupFile = '%s/setup/setup_conda.sh -o %s' % (fdogPath, outPath) diff --git a/setup.py b/setup.py index ad7a1b7..75573c1 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,8 @@ setup( name="fdog", - version="0.0.13", + version="0.0.33", + python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, @@ -41,7 +42,8 @@ 'tqdm', 'ete3', 'six', - 'greedyFAS>=1.4.0' + 'PyYAML', + 'greedyFAS>=1.5.0' ], entry_points={ 'console_scripts': ["fdog.run = fdog.runSingle:main", @@ -52,7 +54,9 @@ "fdog.addTaxa = fdog.addTaxa:main", "fdog.showTaxa = fdog.showTaxa:main", "fdog.mergeOutput = fdog.mergeOutput:main", - "fdog.remove = fdog.removefDog:main"], + "fdog.remove = fdog.removefDog:main", + "fdog.assembly = fdog.fDOGassembly:main", + "fdog.mergeAssembly = fdog.mergeAssemblyOutput:main"], }, license="GPL-3.0", classifiers=[ From f8ccac590b46677d81bcc25516666b626a261f2b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 30 Jun 2021 15:09:08 +0200 Subject: [PATCH 094/192] measure computational time --- fdog/fDOGassembly.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index de9f343..3b34a8d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -8,6 +8,7 @@ import argparse import yaml import subprocess +import time ########################### functions ########################################## def load_config(config_file): with open(config_file, 'r') as stream: @@ -428,10 +429,6 @@ def cleanup(tmp, tmp_path): if tmp == False: os.system('rm -r ' + tmp_path) -def checkOptions(): - pass - #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! - def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names @@ -520,6 +517,8 @@ def main(): #################### handle user input ######################################## + start = time.clock() + version = '0.0.1' parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') @@ -796,6 +795,7 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: + fas = time.clock() print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -816,6 +816,7 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': + fas = time.clock() print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) @@ -833,5 +834,10 @@ def main(): f.close() + end = time.clock() + + print("Time w/o FAS: " + str(end-fas)) + print("Time complete: " + str(end-start)) + if __name__ == '__main__': main() From 1cf64f1f03dc07357e576744ba3751261b59a77b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 30 Jun 2021 16:25:17 +0200 Subject: [PATCH 095/192] measure computational time --- fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3b34a8d..229a546 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -517,7 +517,7 @@ def main(): #################### handle user input ######################################## - start = time.clock() + start = time.time() version = '0.0.1' @@ -795,7 +795,7 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: - fas = time.clock() + fas = time.time() print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -816,7 +816,7 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': - fas = time.clock() + fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) @@ -834,7 +834,7 @@ def main(): f.close() - end = time.clock() + end = time.time() print("Time w/o FAS: " + str(end-fas)) print("Time complete: " + str(end-start)) From 6e163ba531b1816eb5faa8f4b315e6e1e5c448ff Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 30 Jun 2021 16:32:40 +0200 Subject: [PATCH 096/192] bug fix --- fdog/fDOGassembly.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 229a546..a6a3bb8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -832,12 +832,14 @@ def main(): else: cleanup(tmp, out + "/tmp/") - f.close() + end = time.time() print("Time w/o FAS: " + str(end-fas)) print("Time complete: " + str(end-start)) + f.close() + if __name__ == '__main__': main() From 1d1c47a572015d2cef9e121705993d29090fceee Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Jul 2021 09:46:57 +0200 Subject: [PATCH 097/192] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a6a3bb8..5ff5cb1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -836,6 +836,7 @@ def main(): end = time.time() + sys.stdout = sys.__stdout__ print("Time w/o FAS: " + str(end-fas)) print("Time complete: " + str(end-start)) From 6e0ce726ffd6425574bbdb73901285463b3af5a4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Jul 2021 10:12:06 +0200 Subject: [PATCH 098/192] computational time output --- fdog/fDOGassembly.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5ff5cb1..c8a096b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -837,8 +837,7 @@ def main(): end = time.time() sys.stdout = sys.__stdout__ - print("Time w/o FAS: " + str(end-fas)) - print("Time complete: " + str(end-start)) + print(group + "\t" + str(end-start) + "\t" + str(end-start)) f.close() From a1cb75d31205dec99f6fe8ef4a6f164395086af3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Jul 2021 10:18:54 +0200 Subject: [PATCH 099/192] corrected computational time output --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c8a096b..a3ac854 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -837,7 +837,7 @@ def main(): end = time.time() sys.stdout = sys.__stdout__ - print(group + "\t" + str(end-start) + "\t" + str(end-start)) + print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() From 328f26dda0e5e1eaaf22dfd37658f5af795c802d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 20 Jul 2021 15:21:28 +0200 Subject: [PATCH 100/192] automatic augustus installation during setup --- fdog/fDOGassembly.py | 20 +++++++++++++++++++- fdog/setup/install_lib.sh | 9 ++++++++- fdog/setup/setup.sh | 1 + 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a3ac854..2575b05 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,3 +1,21 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2020 Hannah Muelbaier +# +# This script is used to run fDOG-Assembly which performs targeted ortholog +# searches on genome assemblies +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@gmail.com +# +####################################################################### + ############################ imports ########################################### import os import os.path @@ -519,7 +537,7 @@ def main(): start = time.time() - version = '0.0.1' + version = '0.1.1' parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh index ff81e88..2e8ff02 100755 --- a/fdog/setup/install_lib.sh +++ b/fdog/setup/install_lib.sh @@ -85,6 +85,7 @@ dependenciesUbuntu=( perl-doc locales lib32z1 + augustus ) dependenciesMac=( @@ -94,6 +95,7 @@ dependenciesMac=( mafft brewsci/bio/muscle blast + augustus ) if [ "$sys" == "Darwin" ]; then @@ -108,7 +110,11 @@ else sudo apt-get update -y for i in "${dependenciesUbuntu[@]}"; do echo $i - sudo apt-get install -y -qq $i > /dev/null + if ["$i" == "augustus"]; then + sudo apt install augustus > /dev/null + else + sudo apt-get install -y -qq $i > /dev/null + fi done fi @@ -119,6 +125,7 @@ dependencies=( mafft muscle blastn + augustus ) for i in "${dependencies[@]}"; do diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 3c561e7..d9e0077 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -309,6 +309,7 @@ mafft muscle clustalw blastp +augustus ) for i in "${dependencies[@]}"; do From 594715da279c625b4b9ff03fca153c7bcfde4695 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 15:43:59 +0200 Subject: [PATCH 101/192] added tblastn version check --- fdog/setup/setup.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index d9e0077..96ac1c1 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -310,6 +310,7 @@ muscle clustalw blastp augustus +tblastn ) for i in "${dependencies[@]}"; do @@ -319,6 +320,13 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi + if [ $tool == tblastn]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + if [ "$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi + fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" flag=1 From be91b3b6d3577b91bf69c73bec3a2dec4c316d5c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 15:48:43 +0200 Subject: [PATCH 102/192] bug fix --- fdog/setup/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 96ac1c1..e562ca8 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -320,10 +320,10 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi - if [ $tool == tblastn]; then + if [ $tool == "tblastn"]; then requiredver="2.9.0" currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - if [ "$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then + if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" fi fi From 4b5fb49a019ab87560a383fa05e0f40e2143b501 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 15:55:12 +0200 Subject: [PATCH 103/192] bug fix --- fdog/setup/setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index e562ca8..d5d740b 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -320,12 +320,12 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi - if [ $tool == "tblastn"]; then + if [ $tool == "tblastn" ]; then requiredver="2.9.0" currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then - echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" - fi + # if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then + # echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + # fi fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" From c630d75f8ce7710924482f03bdf3e19796d471ac Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 16:15:41 +0200 Subject: [PATCH 104/192] testing BLAST version check --- fdog/setup/setup.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index d5d740b..1f74552 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -323,9 +323,10 @@ for i in "${dependencies[@]}"; do if [ $tool == "tblastn" ]; then requiredver="2.9.0" currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - # if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then - # echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" - # fi + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" From f31cebf94a9c5161023182b307630f0f6d9e1e50 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 16:22:54 +0200 Subject: [PATCH 105/192] tblastn version check during fdog.setup --conda --- fdog/setup/setup_conda.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index ddc4e23..7b4bd08 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -369,6 +369,8 @@ clustalw mafft muscle fasta3 +augustus +tblastn ) for i in "${condaPkgs[@]}"; do if [[ -z $(conda list | $grepprog "$i ") ]]; then @@ -381,6 +383,13 @@ for i in "${condaPkgs[@]}"; do progname="hmmsearch" elif [ "$i" == "fasta3" ]; then progname="fasta36" + elif [ "$i" == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi fi if [ -z "$(which $progname)" ]; then echo -e "\t\e[31m$i could not be installed\e[0m" From 6edf7a01486af4cde7da9a2a028936d5f7710d86 Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 2 Aug 2021 13:36:20 +0200 Subject: [PATCH 106/192] Fdog goes assembly (#10) * bug fix * bug fix * fixed error mapping ID file not found * testing * testing * testing * test * test * testing * testing * testing * testing * fDOGassembly is working on complete assembly_dir * bug fix * bug fix * enabled option -filter for blastp search * bug fix fasoff * testing --strict option * bug fix in --strict option, output is corrected * bug fix in --checkCoorthologsRef * bug fix * clean up * bug fix * adapted handling of variable dataPath * testing * testing * testing * testing * test * test * test * test * test * test * testing * bug fix assemblyDir * testing * testing * testing search taxa * test * enable --searchTaxa option in fdog.assembly * bug fix * testing * testing --searchTaxa adaption * testing * test * test * write debug files to output dir * skip fa.mapping while checking genome_dir * testing * bug fix * testing * bug fix * bug fix * path fix in augustus_ppx * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * testing * testing * added new python script to merge Assembly output from the same Gene but different searchTaxa * added option to merge Assembly output after fDOG calls fdog.assembly multiple times with different searchTaxa * bug fix * corrected fdog.mergeAssembly call * testing * testing * testing * test * moved fdog.mergeAssembly call to another place * testing * testing * testing * testing * testing * testing * corrected fdog.mergeAssembly call * testing * testing * testing * testing * test * disable weight_dir check if option --assembly is used * adapted fdog.assembly call * adapted calcFAS call to deactivate .tsv output * testing * testing * bug fix in function backward search used with option --strict * testing new added option --silent * added more checks to fdogs.run * bug fix * testing * testing * testing * bug fix * bug fix * testing * testing silent mode * testing --silent * symlinks for fasta36 input; improved fdogs.run according to #5 * testing * testing * testing * testing * tetsing * testing * testing * testing * testing * testing * testing * testing * test * test * testing * testing new function to identify coorthologs * testing * testing * testing * testing * testing * testing * testing * testing * testing * finished function coorthologs * bug fix runSingle.py * cleaning output * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * bug fix if augutus can't idetify a gene at a candidate region * testing * bug fix * bug fix * cleaning up * testing * testing * testing * testing * bug fix in merge function, regions in minus strand were not merged correctly * testing * testing * testing * testing * testing * bug fix * testing * testing * testing * testing * testing * clean up * testing * testing * testing * testing * bug fix * testing new tblastn call * testing * testing * testing * testing * testing * code clean up * clean up code * clean up * clean up * reduce output * clean up code * check augustus * testing * adding option to recognize if co-ortholog or not in header of the extended.fa * testing * testing * testing * testing * testing * testing * testing * testing * added function starting_subprocess() to handle call of extern tools more easily * added augustus to dependencies * testing * bug fix * testing * testing * testing * testing * testing * testing * testing * testing * testing * added function to clean up .domain files * testing * testing * testing * testing * improve user output * fdog.assembly started with fDOG is always silent * testing * testing output * testing * testing * testing * testing * testing * removing automatically .tsv files if existing * measure computational time * measure computational time * bug fix * testing * computational time output * corrected computational time output * automatic augustus installation during setup * added tblastn version check * bug fix * bug fix * testing BLAST version check * tblastn version check during fdog.setup --conda Co-authored-by: trvinh --- fdog/fDOGassembly.py | 34 +++++++++++++++++++++++++++++----- fdog/setup/install_lib.sh | 9 ++++++++- fdog/setup/setup.sh | 10 ++++++++++ fdog/setup/setup_conda.sh | 9 +++++++++ 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index de9f343..46f83c0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,3 +1,21 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2020 Hannah Muelbaier +# +# This script is used to run fDOG-Assembly which performs targeted ortholog +# searches on genome assemblies +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@gmail.com +# +####################################################################### + ############################ imports ########################################### import os import os.path @@ -8,6 +26,8 @@ import argparse import yaml import subprocess +import time +======= ########################### functions ########################################## def load_config(config_file): with open(config_file, 'r') as stream: @@ -428,10 +448,6 @@ def cleanup(tmp, tmp_path): if tmp == False: os.system('rm -r ' + tmp_path) -def checkOptions(): - pass - #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! - def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names @@ -520,7 +536,10 @@ def main(): #################### handle user input ######################################## - version = '0.0.1' + start = time.time() + + version = '0.1.1' + parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) @@ -796,6 +815,7 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: + fas = time.time() print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -816,6 +836,7 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': + fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) @@ -831,6 +852,9 @@ def main(): else: cleanup(tmp, out + "/tmp/") + end = time.time() + sys.stdout = sys.__stdout__ + #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() if __name__ == '__main__': diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh index ff81e88..2e8ff02 100755 --- a/fdog/setup/install_lib.sh +++ b/fdog/setup/install_lib.sh @@ -85,6 +85,7 @@ dependenciesUbuntu=( perl-doc locales lib32z1 + augustus ) dependenciesMac=( @@ -94,6 +95,7 @@ dependenciesMac=( mafft brewsci/bio/muscle blast + augustus ) if [ "$sys" == "Darwin" ]; then @@ -108,7 +110,11 @@ else sudo apt-get update -y for i in "${dependenciesUbuntu[@]}"; do echo $i - sudo apt-get install -y -qq $i > /dev/null + if ["$i" == "augustus"]; then + sudo apt install augustus > /dev/null + else + sudo apt-get install -y -qq $i > /dev/null + fi done fi @@ -119,6 +125,7 @@ dependencies=( mafft muscle blastn + augustus ) for i in "${dependencies[@]}"; do diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 3c561e7..1f74552 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -309,6 +309,8 @@ mafft muscle clustalw blastp +augustus +tblastn ) for i in "${dependencies[@]}"; do @@ -318,6 +320,14 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi + if [ $tool == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi + fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" flag=1 diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index ddc4e23..7b4bd08 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -369,6 +369,8 @@ clustalw mafft muscle fasta3 +augustus +tblastn ) for i in "${condaPkgs[@]}"; do if [[ -z $(conda list | $grepprog "$i ") ]]; then @@ -381,6 +383,13 @@ for i in "${condaPkgs[@]}"; do progname="hmmsearch" elif [ "$i" == "fasta3" ]; then progname="fasta36" + elif [ "$i" == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi fi if [ -z "$(which $progname)" ]; then echo -e "\t\e[31m$i could not be installed\e[0m" From 1b4232e6cd214650007e5c24055f3c8618fe01ae Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 2 Aug 2021 13:41:02 +0200 Subject: [PATCH 107/192] Added link to fDOG-Assembly poster for QfO 6.5 --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 52f11e2..9343943 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG) ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) +# Poster fDOG - Assembly +(https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf) # Table of Contents * [How to install](#how-to-install) * [Install the fDOG package](#install-the-fdog-package) From 4798b8fea54782a68d60935bb157ad28cfeaaadb Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 2 Aug 2021 13:41:22 +0200 Subject: [PATCH 108/192] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9343943..8db83ce 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) # Poster fDOG - Assembly -(https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf) +https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf # Table of Contents * [How to install](#how-to-install) * [Install the fDOG package](#install-the-fdog-package) From d64177c3cb0a6afd8a89687a4ee8196f2f85fc7d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 10:22:47 +0200 Subject: [PATCH 109/192] added option checkOff --- fdog/fDOGassembly.py | 8 +++++--- fdog/runMulti.py | 8 ++++---- fdog/runSingle.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 46f83c0..424b6e3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Hannah Muelbaier +# Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog # searches on genome assemblies @@ -538,7 +538,7 @@ def main(): start = time.time() - version = '0.1.1' + version = '0.1.2' parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') @@ -668,7 +668,6 @@ def main(): else: sys.stdout = Logger(f) - # user input has to be checked here before fDOGassembly continues assembly_names = os.listdir(assemblyDir) ########################## some variables ################################## @@ -683,6 +682,9 @@ def main(): consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" + ##################### need a check to see if reference species is part of the core group !########## + + ###################### create tmp folder ################################### cmd = 'mkdir ' + out + '/tmp' diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 6862f6d..c19b598 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -48,7 +48,7 @@ def prepare(args, step): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args + cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args mute = False @@ -70,7 +70,7 @@ def prepare(args, step): coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, True] + otherArgs = [cpu, hyperthread, checkOff, debug, True] assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) @@ -378,7 +378,7 @@ def main(): silent = False else: silent = True - + #fdog_goes_assembly arguments assembly = args.assembly assemblyFile = args.assemblyFile @@ -472,7 +472,7 @@ def main(): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] ### START Path(outpath).mkdir(parents=True, exist_ok=True) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index 1b8a943..c65300f 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -453,7 +453,7 @@ def main(): coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, silent] + otherArgs = [cpu, hyperthread, checkOff, debug, silent] assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] ### run fdog From ef6b0dc6903837130cbac00ea9d6f499e1330373 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 11:17:54 +0200 Subject: [PATCH 110/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 424b6e3..1800a0a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,7 +27,7 @@ import yaml import subprocess import time -======= + ########################### functions ########################################## def load_config(config_file): with open(config_file, 'r') as stream: From d4bf11fb965dcd512790f3ec164c237deaa3a9d4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:06:06 +0200 Subject: [PATCH 111/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1800a0a..bf272d7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -837,7 +837,7 @@ def main(): cleanup(tmp, tmp_path) return 1 #if we searched in more than one taxon - if fasoff == False and searchTaxon == '': + if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From 62badce99d56fb4e634335e20db8cafebcfd89a3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:10:35 +0200 Subject: [PATCH 112/192] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bf272d7..2b5eaf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -838,6 +838,8 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: + print(len(assembly_names)) + print(assembly_names) fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From a51b8f4a0c0b60a33d47f4908efa8630bd67dfca Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:11:58 +0200 Subject: [PATCH 113/192] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bf272d7..2b5eaf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -838,6 +838,8 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: + print(len(assembly_names)) + print(assembly_names) fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From 147bbc9df5d5bf36382ddd222b9c081d061a3797 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:21:26 +0200 Subject: [PATCH 114/192] fixed --- fdog/fDOGassembly.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2b5eaf7..bf272d7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -838,8 +838,6 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: - print(len(assembly_names)) - print(assembly_names) fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From a992e322ca4fd459de9d0d99d867622548dc1af7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 14:49:14 +0200 Subject: [PATCH 115/192] fixed FAS call --- fdog/fDOGassembly.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bf272d7..9c12e9a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -831,8 +831,7 @@ def main(): #if we searched in more than one Taxon and no ortholog was found - - if refBool == False and searchTaxon == '': + if refBool == False and searchTaxon == '' and len(assembly_names) > 1: print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) return 1 @@ -843,7 +842,7 @@ def main(): tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') From abea0980ae0ba82e1565d45cabffdb455e85cdce Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 15:29:06 +0200 Subject: [PATCH 116/192] changed FAS call --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9c12e9a..950aef3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -823,7 +823,7 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'mkdir ' + tmp_path + 'anno_dir' starting_subprocess(cmd, 'silent') - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, 'silent') clean_fas(fasOutFile + "_forward.domains", 'domains') clean_fas(fasOutFile + "_reverse.domains", 'domains') @@ -831,12 +831,12 @@ def main(): #if we searched in more than one Taxon and no ortholog was found - if refBool == False and searchTaxon == '' and len(assembly_names) > 1: + if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) return 1 #if we searched in more than one taxon - if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: + if fasoff == False and searchTaxon == '': fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From d56b83e9cd76ce678b756de5856572d86b31a563 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 14 Sep 2021 16:29:15 +0200 Subject: [PATCH 117/192] new function that checks if input path exist and new function that check if reference species is part of core_group, multiple reference species were accepted, improved output --- fdog/fDOGassembly.py | 78 ++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 950aef3..b27fcbe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -29,6 +29,26 @@ import time ########################### functions ########################################## +def check_path(path): + if not os.path.exists(path): + print(path + " does not exist. Exciting ...") + sys.exit() + +def check_ref_sepc(species_list, fasta_file): + file = open(fasta_file, "r") + lines = file.readlines() + species_file = [] + + for line in lines: + if line[0] == ">": + species = line.split("|")[1] + species_file.append(species) + for species in species_list: + if species in species_file: + return species + print("Reference species is not part of the ortholog group. Exciting ...") + sys.exit() + def load_config(config_file): with open(config_file, 'r') as stream: try: @@ -298,40 +318,40 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva id, gene, evalue = (line.replace("\n", "")).split("\t") gene_name = gene.split("|")[2] if gene_name != old_name: - print("candidate:%s"%(gene_name)) - print("blast-hit:%s"%(id)) + print("candidate:%s"%(gene_name)) if mode == "debug" else "" + print("blast-hit:%s"%(id)) if mode == "debug" else "" min = float(evalue) if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnothitting\n") + print("\tnothitting\n") if mode == "debug" else "" elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnot hitting\n") + print("\tnot hitting\n") if mode == "debug" else "" old_name = gene_name @@ -548,7 +568,7 @@ def main(): required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) - required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True) + required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) optional = parser.add_argument_group('Optional arguments') optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) @@ -611,6 +631,7 @@ def main(): silent = args.silent debug = args.debug + # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") return 1 @@ -637,22 +658,27 @@ def main(): dataPath = cfg['dataPath'] except: dataPath = 'config' - if core_path == '': - core_path = out + '/core_orthologs/' - else: - if not core_path.endswith('/'): - core_path = core_path + '/' - if assemblyDir == '': - assemblyDir = dataPath + '/assembly_dir/' + if out == '': - #print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' else: if out[-1] != "/": out = out + "/" + check_path(out) + + if core_path == '': + core_path = out + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' + check_path(core_path) + + if assemblyDir == '': + assemblyDir = dataPath + '/assembly_dir/' + check_path(assemblyDir) try: @@ -674,16 +700,20 @@ def main(): refBool = False # checks if sequences of reference species were already part of the extended.fa file - ########### paths ########### + ################################# paths #################################### msa_path = core_path + "/" + group +"/"+ group + ".aln" + check_path(msa_path) hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + check_path(hmm_path) fasta_path = core_path + "/" + group +"/"+ group + ".fa" + check_path(fasta_path) consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" - ##################### need a check to see if reference species is part of the core group !########## + ############## is fDOG reference species part of ortholog group? ########### + fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) ###################### create tmp folder ################################### @@ -842,7 +872,7 @@ def main(): tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') From 343199263b697131ca6fcac375aa59b3e10b7458 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 15 Sep 2021 15:11:25 +0200 Subject: [PATCH 118/192] improved user output --- fdog/fDOGassembly.py | 59 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b27fcbe..232090d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -554,22 +554,21 @@ def flush(self): def main(): - #################### handle user input ######################################## + #################### handle user input ##################################### start = time.time() version = '0.1.2' - - + ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) - + ################## required arguments ###################################### required = parser.add_argument_group('Required arguments') required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) - + ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) @@ -592,7 +591,6 @@ def main(): optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) - args = parser.parse_args() # required @@ -711,7 +709,7 @@ def main(): consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" - ############## is fDOG reference species part of ortholog group? ########### + ########### is/are fDOG reference species part of ortholog group? ########## fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) @@ -720,32 +718,33 @@ def main(): cmd = 'mkdir ' + out + '/tmp' starting_subprocess(cmd, 'silent') + print("Gene: " + group) + print("fDOG reference species: " + fdog_ref_species + " \n") + ######################## consensus sequence ################################ #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence for gene " + group + " \n") + print("Building a consensus sequence") cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path starting_subprocess(cmd, mode) - print("consensus sequence is finished\n") + print("\t ...finished\n") ######################## block profile ##################################### - print("Building a block profile for gene " + group + " \n") + print("Building a block profile ...") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path starting_subprocess(cmd, 'silent') if int(os.path.getsize(profile_path)) > 0: - print("block profile is finished \n") + print("\t ...finished \n") else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") new_path = core_path + group +"/"+ group + "_new.aln" - #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - #print(cmd) starting_subprocess(cmd, 'silent') - print("block profile is finished \n") + print(" \t ...finished \n") searchBool = False @@ -780,19 +779,17 @@ def main(): #checks if data base exists already db_check = searching_for_db(db_path) if db_check == 0: - print("creating a blast data base \n") + print("Creating a blast data base...") cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path starting_subprocess(cmd, mode) - print("database is finished \n") - else: - print('blast data base exists already, continuing...') + print("\t ...finished \n") - #makes a tBLASTn search against the new database + #makes a tBLASTn search against database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("tBLASTn search against data base") + print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' starting_subprocess(cmd, mode) - print("tBLASTn search is finished") + print("\t ...finished") ################### search for candidate regions and extract seq ########### # parse blast and filter for candiate regions @@ -800,25 +797,25 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found - print("No candidate region found") + print("No candidate region found!\n") if refBool == True: continue else: taxa = [fdog_ref_species] reciprocal_sequences = 0 else: - print(str(number_regions) + " candiate regions were found. Extracting sequences...") + print(str(number_regions) + " candiate regions were found.\n") extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### - print("starting augustus ppx \n") + print("Starting augustus ppx ...") augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("augustus is finished \n") + print("\t ...finished \n") ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") + print("No genes found at candidate region\n") if searchTaxon == '' and refBool == True: continue else: @@ -831,7 +828,7 @@ def main(): ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: if regions != 0: - print("No ortholog fulfilled the reciprocity criteria") + print("No ortholog fulfilled the reciprocity criteria \n") if searchTaxon == '' and refBool == True: continue else: @@ -848,7 +845,7 @@ def main(): # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: fas = time.time() - print("Calculating FAS scores") + print("Calculating FAS scores ...") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'mkdir ' + tmp_path + 'anno_dir' @@ -858,6 +855,7 @@ def main(): clean_fas(fasOutFile + "_forward.domains", 'domains') clean_fas(fasOutFile + "_reverse.domains", 'domains') clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") #if we searched in more than one Taxon and no ortholog was found @@ -868,7 +866,7 @@ def main(): #if we searched in more than one taxon if fasoff == False and searchTaxon == '': fas = time.time() - print("Calculating FAS scores") + print("Calculating FAS scores ...") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -877,6 +875,7 @@ def main(): clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) @@ -886,7 +885,9 @@ def main(): end = time.time() sys.stdout = sys.__stdout__ #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) + print("fDOG-Assembly finished complete in " + str(end-start) + "seconds.") f.close() + if __name__ == '__main__': main() From a843bfeec60a534776ec3f1e7c036c880a7b2e74 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 17 Sep 2021 10:52:37 +0200 Subject: [PATCH 119/192] added timeout for tblastn search, fixed bug during delition of tmp folder, --- fdog/fDOGassembly.py | 56 ++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 232090d..c54590c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,6 +27,7 @@ import yaml import subprocess import time +import shutil ########################### functions ########################################## def check_path(path): @@ -56,13 +57,17 @@ def load_config(config_file): except yaml.YAMLError as exc: print(exc) -def starting_subprocess(cmd, mode): - if mode == 'debug': - result = subprocess.run(cmd, shell=True) - elif mode == 'silent': - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - elif mode == 'normal': - result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) +def starting_subprocess(cmd, mode, time_out = None): + + try: + if mode == 'debug': + result = subprocess.run(cmd, shell=True, timeout = time_out) + elif mode == 'silent': + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out) + elif mode == 'normal': + result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out) + except subprocess.TimeoutExpired: + return 1 def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions @@ -162,10 +167,11 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) if blast_results == {}: + blast_file.close() return 0,0 else: candidate_regions, number_regions = merge(blast_results, intron_length) - + blast_file.close() return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path, mode): @@ -270,6 +276,10 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates if msaTool == "muscle": os.system("muscle -quiet -in " + output_file + " -out " + aln_file) #print("muscle -quiet -in " + output_file + " -out " + aln_file) + if not os.path.exists(aln_file): + print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.") + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + elif msaTool == "mafft-linsi": #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) @@ -461,12 +471,13 @@ def createFasInput(orthologsOutFile, mappingFile): ncbi_id = (seq.id.split("@"))[1] mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n") - + mappingFile.close() return fas_seed_id def cleanup(tmp, tmp_path): if tmp == False: - os.system('rm -r ' + tmp_path) + while os.path.exists(tmp_path): + shutil.rmtree(tmp_path, ignore_errors=True) def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: @@ -537,6 +548,7 @@ def clean_fas(path, file_type): new_line = id + "\t" + remain file.write(new_line) + file.close() class Logger(object): def __init__(self, file): @@ -708,6 +720,7 @@ def main(): check_path(fasta_path) consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" + tmp_folder = out + "/tmp" ########### is/are fDOG reference species part of ortholog group? ########## @@ -760,7 +773,7 @@ def main(): cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') - tmp_path = out + "/tmp/" + asName + "/" + tmp_path = out + "tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" if searchTaxon != '': orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" @@ -788,8 +801,14 @@ def main(): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - starting_subprocess(cmd, mode) - print("\t ...finished") + exit_code = starting_subprocess(cmd, mode, 3600) + if exit_code == 1: + print("The tblastn search takes too long. Exciting ...") + f.close() + cleanup(tmp, tmp_folder) + sys.exit() + else: + print("\t ...finished") ################### search for candidate regions and extract seq ########### # parse blast and filter for candiate regions @@ -861,7 +880,8 @@ def main(): #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") - cleanup(tmp, tmp_path) + f.close() + cleanup(tmp, tmp_folder) return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': @@ -878,14 +898,16 @@ def main(): print("\t ...finished \n") ################# remove tmp folder ######################################## if searchTaxon != '': - cleanup(tmp, tmp_path) + f.close() + cleanup(tmp, tmp_folder) else: - cleanup(tmp, out + "/tmp/") + f.close() + cleanup(tmp, tmp_folder) end = time.time() sys.stdout = sys.__stdout__ #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) - print("fDOG-Assembly finished complete in " + str(end-start) + "seconds.") + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") f.close() From 36fc207095c5f865547ad6a5b152632ebb71f575 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 20 Sep 2021 16:45:31 +0200 Subject: [PATCH 120/192] added options force and append --- fdog/fDOGassembly.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c54590c..09795e4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -28,6 +28,7 @@ import subprocess import time import shutil +import multiprocessing as mp ########################### functions ########################################## def check_path(path): @@ -602,6 +603,8 @@ def main(): optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) args = parser.parse_args() @@ -640,6 +643,8 @@ def main(): searchTaxon = args.searchTaxon silent = args.silent debug = args.debug + force = args.force + append = args.append # output modes if debug == True and silent == True: @@ -672,13 +677,25 @@ def main(): if out == '': out = os.getcwd() - os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') - out = out + '/' + group + '/' else: if out[-1] != "/": out = out + "/" check_path(out) + if os.path.exists(out + '/' + group): + if append != True and force != True: + print("Output folder for group " + group + " exists already. Please choose --force or --append.") + sys.exit() + elif force == True: + shutil.rmtree(out + '/' + group, ignore_errors=True) + elif append == True: + refBool = True # checks if sequences of reference species were already part of the extended.fa file + else: + refBool = False # checks if sequences of reference species were already part of the extended.fa file + else: + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + if core_path == '': core_path = out + '/core_orthologs/' else: @@ -704,11 +721,9 @@ def main(): else: sys.stdout = Logger(f) - assembly_names = os.listdir(assemblyDir) + ########################### other variables ################################ - ########################## some variables ################################## - - refBool = False # checks if sequences of reference species were already part of the extended.fa file + assembly_names = os.listdir(assemblyDir) ################################# paths #################################### From 2e17db197f2e3e70f0c372a56314fc4722647770 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 23 Sep 2021 13:59:58 +0200 Subject: [PATCH 121/192] tested --foce and --append, only the 10 best candidate regions (regarding score) will be evaluated --- fdog/fDOGassembly.py | 65 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 09795e4..ae29b29 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -72,13 +72,13 @@ def starting_subprocess(cmd, mode, time_out = None): def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions + #format dictionary: {node_name: [(,,evalue, ,,, )]} number_regions = 0 insert_length = int(insert_length) + score_list = [] for key in blast_results: locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) - #print("test") - #print(locations) size_list = len(locations) j = 0 while j < size_list-1: @@ -88,6 +88,8 @@ def merge(blast_results, insert_length): #merge overlapping regions plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -95,6 +97,8 @@ def merge(blast_results, insert_length): #merge overlapping regions minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -102,6 +106,8 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 @@ -109,20 +115,24 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 i += 1 j += 1 + for entry in locations: + score_list.append(entry[6]) number_regions += len(locations) blast_results[key] = locations - return blast_results, number_regions + return blast_results, number_regions, score_list def parse_blast(line, blast_results, cutoff): - # format blast line: - # format dictionary: {node_name: [(,,evalue, ,,)]} + # format blast line: + # format dictionary: {node_name: [(,,evalue, ,,, )]} line = line.replace("\n", "") line_info = line.split("\t") evalue = float(line_info[3]) @@ -131,7 +141,7 @@ def parse_blast(line, blast_results, cutoff): return blast_results, evalue #add region to dictionary else: - node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) + node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6]) split = node_name.split("|") # finding out on which strand tBLASTn found a hit if sstart < send: @@ -145,14 +155,32 @@ def parse_blast(line, blast_results, cutoff): node_name = split[1] if node_name in blast_results: list = blast_results[node_name] - list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand]) + list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]) blast_results[node_name] = list else: - blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]] + blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]] return blast_results, evalue -def candidate_regions(intron_length, cutoff_evalue, tmp_path): +def get_x_results(blast_dic, x, score_list): + + new_dic = {} + score_list.sort(reverse=True) + min = score_list[x - 1] + number_regions = 0 + + for key in blast_dic: + key_list = [] + entries = blast_dic[key] + for i in entries: + if i[6] >= min: + key_list.append(i) + if key_list != []: + new_dic[key] = key_list + number_regions += len(key_list) + return new_dic, number_regions + +def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 blast_file = open(tmp_path + "/blast_results.out", "r") @@ -171,8 +199,10 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_file.close() return 0,0 else: - candidate_regions, number_regions = merge(blast_results, intron_length) + candidate_regions, number_regions, score_list = merge(blast_results, intron_length) blast_file.close() + if number_regions > x: + candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list) return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path, mode): @@ -551,6 +581,10 @@ def clean_fas(path, file_type): file.write(new_line) file.close() +def ortholog_search(): + + pass + class Logger(object): def __init__(self, file): self.file = file @@ -583,7 +617,7 @@ def main(): required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) @@ -688,13 +722,18 @@ def main(): sys.exit() elif force == True: shutil.rmtree(out + '/' + group, ignore_errors=True) + refBool = False + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' elif append == True: - refBool = True # checks if sequences of reference species were already part of the extended.fa file + out = out + '/' + group + '/' + refBool = True else: refBool = False # checks if sequences of reference species were already part of the extended.fa file else: os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' + refBool = False if core_path == '': core_path = out + '/core_orthologs/' @@ -815,7 +854,7 @@ def main(): #makes a tBLASTn search against database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("Starting tBLASTn search...") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' exit_code = starting_subprocess(cmd, mode, 3600) if exit_code == 1: print("The tblastn search takes too long. Exciting ...") From 80562870c5d6395f3aa9cb256281dea1c157104a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 28 Sep 2021 16:07:06 +0200 Subject: [PATCH 122/192] create a function that performs the ortholog search and returns the headers of the found sequences and the corresponding tmp file in which the sequence is located --- fdog/fDOGassembly.py | 315 +++++++++++++++++++++++++++++-------------- 1 file changed, 214 insertions(+), 101 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ae29b29..37b7095 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -464,6 +464,38 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set(orthologs) return list(orthologs), seed +def addRef(output, core_fasta, species_list): + print(species_list) + output_file = open(output, "a+") + seq_records_core = readFasta(core_fasta) + seq_records_core = list(seq_records_core) + for species in species_list: + for entry_core in seq_records_core: + if species in entry_core.id: + output_file.write(">" + entry_core.id + "\n") + output_file.write(str(entry_core.seq) + "\n") + output_file.close() + +def addSeq(output, seq_list): + output_file = open(output, "a+") + + for item in seq_list: + candidate_fasta = item[0] + sequenceIds = item[1] + if sequenceIds == 0 or sequenceIds == []: + pass + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + output_file.close() + def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): output_file = open(output, "a+") @@ -581,9 +613,69 @@ def clean_fas(path, file_type): file.write(new_line) file.close() -def ortholog_search(): - - pass +def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs): + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + #orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + #mappingFile = out + "/tmp/" + group + ".mapping.txt" + + print("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + db_check = searching_for_db(db_path) + + if db_check == 0: + print("Creating a blast data base...") + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + print("\t ...finished \n") + + #makes a tBLASTn search against database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + print("Starting tBLASTn search...") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + exit_code = starting_subprocess(cmd, mode, 3600) + if exit_code == 1: + print("The tblastn search takes too long. Exciting ...") + f.close() + cleanup(tmp, tmp_folder) + sys.exit() + else: + print("\t ...finished") + + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + if regions == 0: + #no candidat region are available, no ortholog can be found + print("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile + + else: + print(str(number_regions) + " candiate regions were found for species %s.\n" % asName) + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search ################################### + print("Starting augustus ppx ...") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("\t ...finished \n") + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + print("No genes found at candidate regions\n") + return [], candidatesOutFile + + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + + if reciprocal_sequences == 0: + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + + return reciprocal_sequences, candidatesOutFile class Logger(object): def __init__(self, file): @@ -639,7 +731,7 @@ def main(): optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) - + optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) args = parser.parse_args() # required @@ -679,6 +771,7 @@ def main(): debug = args.debug force = args.force append = args.append + parallel = args.parallel # output modes if debug == True and silent == True: @@ -815,120 +908,140 @@ def main(): searchBool = False - #################### fDOG assembly computation for all species ############# - for asName in assembly_names: - if searchBool == True: - break - if searchTaxon != '' and searchBool == False: - asName = searchTaxon - searchBool = True + if searchTaxon == '': + ortholog_sequences = [] + cpus = mp.cpu_count() + print(cpus) + #pool = mp.Pool(cpus) + for asName in assembly_names: + reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) + ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) + + orthologsOutFile = out + "/" + group + ".extended.fa" + + if taxa == []: + taxa = [fdog_ref_species] + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + refBool = True + mappingFile = out + "/tmp/" + group + ".mapping.txt" - ################### path definitions ################################### - cmd = 'mkdir ' + out + '/tmp/' + asName - starting_subprocess(cmd, 'silent') - tmp_path = out + "tmp/" + asName + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - if searchTaxon != '': - orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" - fasOutFile = out + "/" + group + "_" + asName - mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" - else: - orthologsOutFile = out + "/" + group + ".extended.fa" - fasOutFile = out + "/" + group - mappingFile = out + "/tmp/" + group + ".mapping.txt" - - print("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - - ######################## tBLASTn ########################################### - #checks if data base exists already - db_check = searching_for_db(db_path) - if db_check == 0: - print("Creating a blast data base...") - cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path - starting_subprocess(cmd, mode) - print("\t ...finished \n") - - #makes a tBLASTn search against database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("Starting tBLASTn search...") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - exit_code = starting_subprocess(cmd, mode, 3600) - if exit_code == 1: - print("The tblastn search takes too long. Exciting ...") - f.close() - cleanup(tmp, tmp_folder) - sys.exit() - else: - print("\t ...finished") + else: + #################### fDOG assembly computation for all species ############# + for asName in assembly_names: + if searchBool == True: + break + if searchTaxon != '' and searchBool == False: + asName = searchTaxon + searchBool = True - ################### search for candidate regions and extract seq ########### - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + ################### path definitions ################################### - if regions == 0: - #no candidat region are available, no ortholog can be found - print("No candidate region found!\n") - if refBool == True: - continue + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + if searchTaxon != '': + orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" + fasOutFile = out + "/" + group + "_" + asName + mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" else: - taxa = [fdog_ref_species] - reciprocal_sequences = 0 - else: - print(str(number_regions) + " candiate regions were found.\n") - extract_seq(regions, db_path, tmp_path, mode) - - ############### make Augustus PPX search ################################### + orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + mappingFile = out + "/tmp/" + group + ".mapping.txt" + + print("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + + ######################## tBLASTn ########################################### + #checks if data base exists already + db_check = searching_for_db(db_path) + if db_check == 0: + print("Creating a blast data base...") + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + print("\t ...finished \n") + + #makes a tBLASTn search against database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + print("Starting tBLASTn search...") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + exit_code = starting_subprocess(cmd, mode, 3600) + if exit_code == 1: + print("The tblastn search takes too long. Exciting ...") + f.close() + cleanup(tmp, tmp_folder) + sys.exit() + else: + print("\t ...finished") - print("Starting augustus ppx ...") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("\t ...finished \n") + ################### search for candidate regions and extract seq ########### + # parse blast and filter for candiate regions + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) - ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate region\n") - if searchTaxon == '' and refBool == True: + if regions == 0: + #no candidat region are available, no ortholog can be found + print("No candidate region found!\n") + if refBool == True: continue else: - reciprocal_sequences = 0 taxa = [fdog_ref_species] + reciprocal_sequences = 0 else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + print(str(number_regions) + " candiate regions were found.\n") + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search ################################### + + print("Starting augustus ppx ...") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("\t ...finished \n") + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + print("No genes found at candidate region\n") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 + taxa = [fdog_ref_species] + else: + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - ################## checking accepted genes for co-orthologs ################ - if reciprocal_sequences == 0: - if regions != 0: - print("No ortholog fulfilled the reciprocity criteria \n") - if searchTaxon == '' and refBool == True: - continue + ################## checking accepted genes for co-orthologs ################ + if reciprocal_sequences == 0: + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria \n") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 else: - reciprocal_sequences = 0 - else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - ################ add sequences to extended.fa in the output folder########## + ################ add sequences to extended.fa in the output folder########## - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - refBool = True + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) + refBool = True - ############### make Annotation with FAS ################################### - # if we want to search in only one Taxon - if searchTaxon != '' and fasoff == False: - fas = time.time() - print("Calculating FAS scores ...") - fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'mkdir ' + tmp_path + 'anno_dir' - starting_subprocess(cmd, 'silent') - cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, 'silent') - clean_fas(fasOutFile + "_forward.domains", 'domains') - clean_fas(fasOutFile + "_reverse.domains", 'domains') - clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') - print("\t ...finished \n") + ############### make Annotation with FAS ################################### + # if we want to search in only one Taxon + if searchTaxon != '' and fasoff == False: + fas = time.time() + print("Calculating FAS scores ...") + fas_seed_id = createFasInput(orthologsOutFile, mappingFile) + # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option + cmd = 'mkdir ' + tmp_path + 'anno_dir' + starting_subprocess(cmd, 'silent') + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + starting_subprocess(cmd, 'silent') + clean_fas(fasOutFile + "_forward.domains", 'domains') + clean_fas(fasOutFile + "_reverse.domains", 'domains') + clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") #if we searched in more than one Taxon and no ortholog was found From ee3636413a9a826d523229d21c9d4e5b88113fe3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 29 Sep 2021 16:10:44 +0200 Subject: [PATCH 123/192] added parallelization with bib multiprocessing --- fdog/fDOGassembly.py | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 37b7095..aadb3f0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -480,8 +480,9 @@ def addSeq(output, seq_list): output_file = open(output, "a+") for item in seq_list: - candidate_fasta = item[0] - sequenceIds = item[1] + print(item) + candidate_fasta = item[1] + sequenceIds = item[0] if sequenceIds == 0 or sequenceIds == []: pass seq_records_candidate = readFasta(candidate_fasta) @@ -613,7 +614,8 @@ def clean_fas(path, file_type): file.write(new_line) file.close() -def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs): +def ortholog_search(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') tmp_path = out + "tmp/" + asName + "/" @@ -628,23 +630,23 @@ def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_speci db_check = searching_for_db(db_path) if db_check == 0: - print("Creating a blast data base...") + #print("Creating a blast data base...") cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path starting_subprocess(cmd, mode) - print("\t ...finished \n") + #print("\t ...finished \n") #makes a tBLASTn search against database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("Starting tBLASTn search...") + #print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' exit_code = starting_subprocess(cmd, mode, 3600) if exit_code == 1: - print("The tblastn search takes too long. Exciting ...") + print("The tblastn search takes too long for species %s. Exciting ..." % asName) f.close() cleanup(tmp, tmp_folder) sys.exit() - else: - print("\t ...finished") + #else: + #print("\t ...finished") regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: @@ -657,13 +659,13 @@ def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_speci extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### - print("Starting augustus ppx ...") + #print("Starting augustus ppx ...") augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("\t ...finished \n") + #print("\t ...finished \n") ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") + #print("No genes found at candidate regions\n") return [], candidatesOutFile reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) @@ -910,15 +912,20 @@ def main(): if searchTaxon == '': ortholog_sequences = [] + calls = [] cpus = mp.cpu_count() - print(cpus) - #pool = mp.Pool(cpus) + pool = mp.Pool(cpus) for asName in assembly_names: - reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) - ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) - + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + #for asName in assembly_names: + #reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) + #ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() orthologsOutFile = out + "/" + group + ".extended.fa" - + for i in results: + ortholog_sequences.append(i) if taxa == []: taxa = [fdog_ref_species] addRef(orthologsOutFile, fasta_path, taxa) From da8cdcc67d7ae8306c73deb4421e0d2b9078689a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 1 Oct 2021 10:57:19 +0200 Subject: [PATCH 124/192] added output for computational time --- fdog/fDOGassembly.py | 81 +++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index aadb3f0..97ec269 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -341,7 +341,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva try: id_ref = seedDic[fdog_ref_species] except KeyError: - print("The fDOG reference species isn't part of the core ortholog group, ... exciting") + #print("The fDOG reference species isn't part of the core ortholog group, ... exciting") return 0, seed if searchTool == "blast": cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -397,7 +397,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva if orthologs == []: - print("No hit in the backward search, ...exciting") + #print("No hit in the backward search, ...exciting") return 0, seed else: @@ -422,12 +422,12 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set({}) for species in seed: - print("backward search in species " + species + "\n") + print("backward search in species %s\n" %species) orthologs_new = set({}) try: id_ref = seedDic[species] except KeyError: - print("The species " + species + " isn't part of the core ortholog group, ... exciting") + #print("The species " + species + " isn't part of the core ortholog group, ... exciting") return 0, seed cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -450,12 +450,13 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(species) #print(orthologs_new) + #print(orthologs) if species == fdog_ref_species: orthologs = orthologs_new else: orthologs = orthologs & orthologs_new - if orthologs == {}: - print("No ortholog was found with option --strict") + if len(orthologs) == 0: + #print("No ortholog was found with option --strict") return 0, seed @@ -465,7 +466,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addRef(output, core_fasta, species_list): - print(species_list) + #print(species_list) output_file = open(output, "a+") seq_records_core = readFasta(core_fasta) seq_records_core = list(seq_records_core) @@ -480,7 +481,7 @@ def addSeq(output, seq_list): output_file = open(output, "a+") for item in seq_list: - print(item) + #print(item) candidate_fasta = item[1] sequenceIds = item[0] if sequenceIds == 0 or sequenceIds == []: @@ -540,8 +541,12 @@ def createFasInput(orthologsOutFile, mappingFile): def cleanup(tmp, tmp_path): if tmp == False: + timeout = time.time() + 60*1 while os.path.exists(tmp_path): shutil.rmtree(tmp_path, ignore_errors=True) + if time.time() > timeout: + print("tmp folder could not be removed!") + break def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: @@ -639,7 +644,10 @@ def ortholog_search(args): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt #print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + time_tblastn_start = time.time() exit_code = starting_subprocess(cmd, mode, 3600) + time_tblastn_end = time.time() + time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: print("The tblastn search takes too long for species %s. Exciting ..." % asName) f.close() @@ -647,6 +655,7 @@ def ortholog_search(args): sys.exit() #else: #print("\t ...finished") + print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: @@ -655,13 +664,17 @@ def ortholog_search(args): return [], candidatesOutFile else: - print(str(number_regions) + " candiate regions were found for species %s.\n" % asName) + print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### #print("Starting augustus ppx ...") + time_augustus_start = time.time() augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) #print("\t ...finished \n") + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + print("Time augustus: %s species %s \n" % (str(time_augustus), asName)) ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: @@ -884,7 +897,7 @@ def main(): print("fDOG reference species: " + fdog_ref_species + " \n") ######################## consensus sequence ################################ - + group_computation_time_start = time.time() #make a majority-rule consensus sequence with the tool hmmemit from hmmer print("Building a consensus sequence") cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path @@ -908,24 +921,35 @@ def main(): starting_subprocess(cmd, 'silent') print(" \t ...finished \n") + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + searchBool = False if searchTaxon == '': ortholog_sequences = [] - calls = [] - cpus = mp.cpu_count() - pool = mp.Pool(cpus) - for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) - #for asName in assembly_names: - #reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) - #ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) - results = (pool.imap_unordered(ortholog_search, calls)) - pool.close() - pool.join() + time_ortholog_start = time.time() + if parallel == True: + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() + for i in results: + ortholog_sequences.append(i) + else: + for asName in assembly_names: + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] + reciprocal_sequences, candidatesOutFile = ortholog_search(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + orthologsOutFile = out + "/" + group + ".extended.fa" - for i in results: - ortholog_sequences.append(i) + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start if taxa == []: taxa = [fdog_ref_species] addRef(orthologsOutFile, fasta_path, taxa) @@ -1071,6 +1095,11 @@ def main(): clean_fas(out + group + ".phyloprofile", 'phyloprofile') print("\t ...finished \n") ################# remove tmp folder ######################################## + end = time.time() + time_fas = end - fas + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") + print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + sys.stdout = sys.__stdout__ if searchTaxon != '': f.close() cleanup(tmp, tmp_folder) @@ -1078,11 +1107,9 @@ def main(): f.close() cleanup(tmp, tmp_folder) - end = time.time() - sys.stdout = sys.__stdout__ + #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) - print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") - f.close() + if __name__ == '__main__': From ba752aa04f5ccf706982b3647499396c0064137d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 11 Oct 2021 13:29:27 +0200 Subject: [PATCH 125/192] updated fDOG-Assembly structure. fDOG-Assembly is now a separate script and can only be started with the command fdog.assembly --- fdog/bin/oneSeq.pl | 125 +++----------------- fdog/fDOGassembly.py | 223 ++++++++---------------------------- fdog/mergeAssemblyOutput.py | 124 -------------------- fdog/runMulti.py | 31 +---- fdog/runSingle.py | 64 +---------- 5 files changed, 74 insertions(+), 493 deletions(-) delete mode 100644 fdog/mergeAssemblyOutput.py diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 1b0839f..a99e1e6 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -207,7 +207,6 @@ my $idx_dir = "$path/taxonomy/"; my $dataDir = $path . '/data'; my $weightPath = "$path/weight_dir/"; -my $assembly_dir = "$path/assembly_dir/"; my @defaultRanks = ( 'superkingdom', 'kingdom', @@ -312,15 +311,6 @@ my %hashTree; my $aln = 'muscle'; my $searchTaxa; -#variables for fdog_goes_assembly -my $assembly; -my $augustusRefSpec; -my $avIntron; -my $lengthExtension; -my $assemblyPath; -my $searchTool = 'blast'; -my $matrix = 'blosum62'; -my $dataPath = ''; ################# Command line options GetOptions ( "h" => \$help, @@ -383,15 +373,7 @@ "distDeviation=s" => \$distDeviation, "aligner=s" => \$aln, "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa, - "assembly" => \$assembly, - "assemblypath=s" => \$assemblyPath, - "augustusRefSpec=s" => \$augustusRefSpec, - "avIntron=s" => \$avIntron, - "lengthExtension=s" => \$lengthExtension, - "searchTool=s" => \$searchTool, - "scoringmatrix=s" => \$matrix, - "dataPath=s" => \$dataPath + "searchTaxa=s" => \$searchTaxa ); $outputPath = abs_path($outputPath); @@ -403,8 +385,6 @@ $weightPath = abs_path($weightPath)."/"; $genome_dir = abs_path($genome_dir)."/"; $taxaPath = $genome_dir; -$dataPath = abs_path($dataPath)."/"; -$assembly_dir = abs_path($assemblyPath)."/"; ############# do initial check if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { @@ -414,7 +394,7 @@ initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); } - if (!defined $coreex && !defined $assembly) { + if (!defined $coreex) { if (!grep(/$minDist/, @defaultRanks)) { die "ERROR: minDist $minDist invalid!\n"; } @@ -498,7 +478,7 @@ # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) # get annotations for seed sequence if fas support is on -if ($fas_support && !$assembly){ +if ($fas_support){ if (!$weightPath) { createWeightFolder(); } @@ -507,7 +487,7 @@ my $coreStTime = gettime(); #time; #core-ortholog search -if (!$coreex && !$assembly) { +if (!$coreex) { print "\nCore compiling...\n"; $coremode = 1; $taxaPath = $blastPath; @@ -645,12 +625,7 @@ my $final_eval_blast = $eval_blast*$eval_relaxfac; my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - if (!$assembly){ - $taxaPath = $genome_dir; - } - else{ - $taxaPath = $assembly_dir; - } + $taxaPath = $genome_dir; my @searchTaxa; unless ($searchTaxa) { unless($groupNode) { @@ -706,63 +681,7 @@ } } } - if ($assembly){ - $eval_blast = sprintf("%f", $eval_blast); - if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); - - if (defined $assemblyPath){ - push(@assembly_cmd, "--assemblyPath $assemblyPath") - } - if (defined $avIntron){ - push(@assembly_cmd, "--avIntron $avIntron "); - } - if (defined $lengthExtension){ - push(@assembly_cmd, "--lengthExtension $lengthExtension "); - } - if (!$autoclean){ - push(@assembly_cmd, "--tmp "); - } - if ($outputPath){ - push(@assembly_cmd, "--out $outputPath "); - } - if (defined $strict){ - push(@assembly_cmd, "--strict"); - } - if ($eval_blast){ - push(@assembly_cmd, "--evalBlast $eval_blast "); - } - if ($searchTool){ - push(@assembly_cmd, "--msaTool $aln "); - } - if (defined $checkcoorthologsref){ - push(@assembly_cmd, "--checkCoorthologsRef"); - } - if ($searchTool){ - push(@assembly_cmd, "--searchTool $searchTool"); - } - if ($matrix){ - push(@assembly_cmd, "--scoringmatrix $matrix"); - } - if ($coreOrthologsPath){ - push(@assembly_cmd, "--coregroupPath $coreOrthologsPath"); - } - if ($fasoff){ - push(@assembly_cmd, "--fasoff"); - } - if ($searchTaxon){ - push(@assembly_cmd, "--searchTaxon $searchTaxon"); - } - if ($filter){ - push(@assembly_cmd, "--filter $filter"); - } - printDebug(@assembly_cmd); - system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n"; - } - } - else{ runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln); - } $pm->finish; } $pm->wait_all_children; @@ -774,8 +693,8 @@ push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n"; - -if(!$coreOnly && !$assembly){ +## Evaluation of all orthologs that are predicted by the final run +if(!$coreOnly){ my $fasStTime = gettime(); my $processID = $$; @@ -787,7 +706,7 @@ addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # calculate FAS scores for final extended.fa - if ($fas_support && !$assembly) { + if ($fas_support) { print "Starting the feature architecture similarity score computation...\n"; my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno"; unless ($countercheck) { @@ -800,21 +719,12 @@ } push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; - if($autoclean){ print "Cleaning up...\n"; runAutoCleanUp($processID); } } -if ($assembly){ - my $file_assembly_out; - $file_assembly_out = $outputPath . '/' . $seqName; - my $cmd_merge; - $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup"; - printDebug($cmd_merge); - system($cmd_merge); -} ## Delete tmp folder unless ($debug) { my $delTmp = "rm -rf $tmpdir"; @@ -1224,10 +1134,10 @@ sub checkOptions { if ($force == 1 and $append ==1) { $force = 0; } - ### check the presence of the pre-computed core set if options reuseCore or assembly is used - if ($coreex || $assembly) { + ### check the presence of the pre-computed core set + if ($coreex) { if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; + print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; exit; } } @@ -1298,7 +1208,7 @@ sub checkOptions { ### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected $optbreaker = 0; - while(!$minCoreOrthologs and (!$coreex and !$assembly)) { + while(!$minCoreOrthologs and !$coreex) { if ($optbreaker >= 3){ print "No proper number given ... exiting.\n"; exit; @@ -1313,12 +1223,10 @@ sub checkOptions { $filter = 'no' if $filter eq 'F'; } - if (!$assembly){ - $inputSeq = fetchSequence($seqFile, $dataDir); - } + $inputSeq = fetchSequence($seqFile, $dataDir); ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '' && !$assembly) { + if($seqId eq '') { my $besthit; if (!$blast){ ## a refspec has been determined @@ -1445,9 +1353,8 @@ sub checkOptions { #### checking for the min and max distance for the core set compilation #### omit this check, if the option reuseCore has been selected (added 2019-02-04) $optbreaker = 0; - if (!$coreex and !$assembly) { + if (!$coreex) { my $node; - #print "Testing coreex assembly\n"; $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); $node->name('supplied', $refSpec); if (lc($maxDist) eq "root"){ @@ -2709,7 +2616,7 @@ sub initialCheck { } } # check weight_dir - if ($fasoff != 1 && !$assembly) { + if ($fasoff != 1) { my %seen; my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); my @notFolder; diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 97ec269..eb9dc41 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -737,11 +737,11 @@ def main(): optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') - optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') + optional.add_argument('--searchTaxa', help='Search Taxon name', action='store', nargs="+", default=[]) optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) @@ -776,12 +776,8 @@ def main(): msaTool = args.msaTool matrix = args.scoringmatrix taxa = args.coreTaxa - if taxa == '': - taxa =[] - else: - taxa = taxa.split(",") fasoff = args.fasoff - searchTaxon = args.searchTaxon + searchTaxa = args.searchTaxa silent = args.silent debug = args.debug force = args.force @@ -816,7 +812,6 @@ def main(): except: dataPath = 'config' - if out == '': out = os.getcwd() else: @@ -854,7 +849,6 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' check_path(assemblyDir) - try: f = open(out + "/fdog.log", "a+") except FileNotFoundError: @@ -869,8 +863,15 @@ def main(): sys.stdout = Logger(f) ########################### other variables ################################ - - assembly_names = os.listdir(assemblyDir) + if searchTaxa == []: + assembly_names = os.listdir(assemblyDir) + else: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa ################################# paths #################################### @@ -924,170 +925,48 @@ def main(): group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start - searchBool = False - - if searchTaxon == '': - ortholog_sequences = [] - time_ortholog_start = time.time() - if parallel == True: - calls = [] - cpus = mp.cpu_count() - pool = mp.Pool(cpus) - for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) - - results = (pool.imap_unordered(ortholog_search, calls)) - pool.close() - pool.join() - for i in results: - ortholog_sequences.append(i) - else: - for asName in assembly_names: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] - reciprocal_sequences, candidatesOutFile = ortholog_search(args) - ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) - - orthologsOutFile = out + "/" + group + ".extended.fa" - time_ortholog_end = time.time() - time_ortholog = time_ortholog_end - time_ortholog_start - if taxa == []: - taxa = [fdog_ref_species] - addRef(orthologsOutFile, fasta_path, taxa) - addSeq(orthologsOutFile, ortholog_sequences) - refBool = True - mappingFile = out + "/tmp/" + group + ".mapping.txt" + ###################### ortholog search ##################################### + ortholog_sequences = [] + time_ortholog_start = time.time() + if parallel == True: + ##################### parallel compuataion ############################# + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() + for i in results: + ortholog_sequences.append(i) else: - #################### fDOG assembly computation for all species ############# + ###################### computation species per species ################ for asName in assembly_names: - if searchBool == True: - break - if searchTaxon != '' and searchBool == False: - asName = searchTaxon - searchBool = True - - ################### path definitions ################################### - - cmd = 'mkdir ' + out + '/tmp/' + asName - starting_subprocess(cmd, 'silent') - tmp_path = out + "tmp/" + asName + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - if searchTaxon != '': - orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" - fasOutFile = out + "/" + group + "_" + asName - mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" - else: - orthologsOutFile = out + "/" + group + ".extended.fa" - fasOutFile = out + "/" + group - mappingFile = out + "/tmp/" + group + ".mapping.txt" - - print("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - - ######################## tBLASTn ########################################### - #checks if data base exists already - db_check = searching_for_db(db_path) - if db_check == 0: - print("Creating a blast data base...") - cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path - starting_subprocess(cmd, mode) - print("\t ...finished \n") - - #makes a tBLASTn search against database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("Starting tBLASTn search...") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - exit_code = starting_subprocess(cmd, mode, 3600) - if exit_code == 1: - print("The tblastn search takes too long. Exciting ...") - f.close() - cleanup(tmp, tmp_folder) - sys.exit() - else: - print("\t ...finished") - - ################### search for candidate regions and extract seq ########### - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) - - if regions == 0: - #no candidat region are available, no ortholog can be found - print("No candidate region found!\n") - if refBool == True: - continue - else: - taxa = [fdog_ref_species] - reciprocal_sequences = 0 - else: - print(str(number_regions) + " candiate regions were found.\n") - extract_seq(regions, db_path, tmp_path, mode) - - ############### make Augustus PPX search ################################### - - print("Starting augustus ppx ...") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("\t ...finished \n") - - ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate region\n") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - taxa = [fdog_ref_species] - else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - - - ################## checking accepted genes for co-orthologs ################ - if reciprocal_sequences == 0: - if regions != 0: - print("No ortholog fulfilled the reciprocity criteria \n") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - - ################ add sequences to extended.fa in the output folder########## - - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - refBool = True + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] + reciprocal_sequences, candidatesOutFile = ortholog_search(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + ".extended.fa" + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start + if taxa == []: + taxa = [fdog_ref_species] + if append == True: + addSeq(orthologsOutFile, ortholog_sequences) + else: + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + mappingFile = out + "/tmp/" + group + ".mapping.txt" - ############### make Annotation with FAS ################################### - # if we want to search in only one Taxon - if searchTaxon != '' and fasoff == False: - fas = time.time() - print("Calculating FAS scores ...") - fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'mkdir ' + tmp_path + 'anno_dir' - starting_subprocess(cmd, 'silent') - cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, 'silent') - clean_fas(fasOutFile + "_forward.domains", 'domains') - clean_fas(fasOutFile + "_reverse.domains", 'domains') - clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') - print("\t ...finished \n") - - - #if we searched in more than one Taxon and no ortholog was found - if refBool == False and searchTaxon == '': - print("No orthologs found. Exciting ...") - f.close() - cleanup(tmp, tmp_folder) - return 1 - #if we searched in more than one taxon - if fasoff == False and searchTaxon == '': + if fasoff == False: fas = time.time() print("Calculating FAS scores ...") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') @@ -1100,17 +979,9 @@ def main(): print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ - if searchTaxon != '': - f.close() - cleanup(tmp, tmp_folder) - else: - f.close() - cleanup(tmp, tmp_folder) - - - #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) - + f.close() + cleanup(tmp, tmp_folder) if __name__ == '__main__': main() diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py deleted file mode 100644 index 1606b1d..0000000 --- a/fdog/mergeAssemblyOutput.py +++ /dev/null @@ -1,124 +0,0 @@ -# -*- coding: utf-8 -*- - -####################################################################### -# Copyright (C) 2020 Vinh Tran -# -# This script is used to merge all output files (.extended.fa, .phyloprofile, -# _forward.domains, _reverse.domains) in a given directory into one file each. -# -# This script is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for -# more details -# -# Contact: hannah.muelbaier@stud.uni-frankfurt.de -# -####################################################################### - -import sys -import os -from os import listdir as ldir -import argparse -from pathlib import Path - -def main(): - version = '0.0.1' - parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.') - parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', - action='store', default='', required=True) - parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) - parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False) - args = parser.parse_args() - - directory = args.input - out = args.output - cleanup = args.cleanup - if not os.path.exists(os.path.abspath(directory)): - sys.exit('%s not found' % directory) - else: - directory = os.path.abspath(directory) - - phyloprofile = None - set_phylo = set() - domains_0 = None - set_domains_f = set() - domains_1 = None - set_domains_r = set() - ex_fasta = None - set_fasta = set() - header_bool = False - for infile in ldir(directory): - if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': - if not phyloprofile: - phyloprofile = open(out + '.phyloprofile', 'w') - phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo: - phyloprofile.write(line) - if len(lines) > 1: - set_phylo = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains': - if not domains_0: - domains_0 = open(out + '_forward.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_f: - domains_0.write(line) - if len(lines) > 1: - set_domains_f = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains': - if not domains_1: - domains_1 = open(out + '_reverse.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_r: - domains_1.write(line) - if len(lines) > 1: - set_domains_r = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa': - if not ex_fasta: - ex_fasta = open(out + '.extended.fa', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - header = set() - #print(set_fasta) - for line in lines: - if line[0] == ">": - header.add(line) - if line not in set_fasta: - ex_fasta.write(line) - header_bool = True - else: - header_bool = False - else: - if header_bool == True: - ex_fasta.write(line) - set_fasta = header - if cleanup == True: - os.remove(directory + '/' +infile) - elif infile.endswith('.tsv'): - os.remove(directory + '/' + infile) - - if phyloprofile: - phyloprofile.close() - if domains_0: - domains_0.close() - if domains_1: - domains_1.close() - if ex_fasta: - ex_fasta.close() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/fdog/runMulti.py b/fdog/runMulti.py index c19b598..c19b0ff 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -48,8 +48,7 @@ def prepare(args, step): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args - + cpu, hyperthread, checkOff, debug, silent) = args mute = False if step == 'core': @@ -71,9 +70,7 @@ def prepare(args, step): fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, checkOff, debug, True] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] - return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) - + return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) def getSeedName(seedFile): seqName = seedFile.split('.')[0] @@ -108,10 +105,9 @@ def compileCore(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute]) + coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) if len(coreCompilationJobs) > 0: pool = mp.Pool(cpu) coreOut = [] @@ -133,7 +129,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') if mute == True: print(seed) else: @@ -295,14 +291,6 @@ def main(): optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -379,15 +367,6 @@ def main(): else: silent = True - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix - ### check fas if not fasoff: try: @@ -472,7 +451,7 @@ def main(): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + cpu, hyperthread, checkOff, debug, silent] ### START Path(outpath).mkdir(parents=True, exist_ok=True) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index c65300f..c4abb82 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -65,13 +65,13 @@ def getfdogInfo(fdogPath, infoType): exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args # basic command (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) + (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs + cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) # add other I/O options (append, force, noCleanup, group, blast, db) = ioArgs if append == True: @@ -163,28 +163,7 @@ def runSingle(args): cmd = cmd + ' -debug' if silent == True: cmd = cmd + ' -silent' - # add assembly options - (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs - if assembly == True: - cmd = cmd + ' -assembly' - cmd = cmd + ' -reuseCore' - if not augustusRefSpec == '': - cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec - else: - sys.exit('An augutus reference species is requiered by using the option --assembly') - if not avIntron == '': - cmd = cmd + ' -avIntron=%s' % avIntron - if not lengthExtension == '': - cmd = cmd + ' -lengthExtension=%s' % lengthExtension - if not assemblyFile == '': - cmd = cmd + ' -assemblyFile=%s' % assemblyFile - if not searchTool == '': - cmd = cmd + ' -searchTool=%s' % searchTool - if not matrix == '': - cmd = cmd + ' -scoringmatrix=%s' % matrix - if not dataPath == '': - cmd = cmd + ' -dataPath=%s' % dataPath - #print(cmd) + # print(cmd) if mute == True: cmd = cmd + ' > /dev/null 2>&1' try: @@ -211,8 +190,6 @@ def main(): optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) @@ -295,14 +272,6 @@ def main(): optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -322,7 +291,6 @@ def main(): searchpath = args.searchpath weightpath = args.weightpath pathFile = args.pathFile - assemblypath = args.assemblypath # other I/O arguments append = args.append @@ -378,15 +346,6 @@ def main(): else: silent = True - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix - ### get fdog and data path dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') @@ -434,30 +393,19 @@ def main(): except: sys.exit('weightpath not found in %s' % pathFile) - if assemblypath == '': - assemblypath = dataPath + '/assembly_dir' - if dataPath == 'config': - try: - assemblypath = cfg['assemblypath'] - except: - sys.exit('assemblypath not found in %s' % pathFile) - if assembly == True: - searchpath = assemblypath - ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) # group arguments basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath] + pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, checkOff, debug, silent] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False]) + runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) if __name__ == '__main__': main() From 49a430b913970276efa3c69af3ca9007f7b5e3c9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 11 Oct 2021 15:56:40 +0200 Subject: [PATCH 126/192] testing addSeq function --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index eb9dc41..3940f04 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -484,6 +484,8 @@ def addSeq(output, seq_list): #print(item) candidate_fasta = item[1] sequenceIds = item[0] + print(sequenceIds) + print(type(sequenceIds)) if sequenceIds == 0 or sequenceIds == []: pass seq_records_candidate = readFasta(candidate_fasta) From e18872b31a0c5b013a2c7bdece18674c3b8c5974 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 11 Oct 2021 16:02:13 +0200 Subject: [PATCH 127/192] bug fix in addSeq function --- fdog/fDOGassembly.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3940f04..71beafc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -484,10 +484,8 @@ def addSeq(output, seq_list): #print(item) candidate_fasta = item[1] sequenceIds = item[0] - print(sequenceIds) - print(type(sequenceIds)) if sequenceIds == 0 or sequenceIds == []: - pass + continue seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: From b4d1e0c3f8fb09ca0214c45ff40789e3b56d64b1 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sat, 16 Oct 2021 14:40:39 +0200 Subject: [PATCH 128/192] bug fix in ortholog search function --- fdog/fDOGassembly.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 71beafc..7c45233 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -650,8 +650,7 @@ def ortholog_search(args): time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: print("The tblastn search takes too long for species %s. Exciting ..." % asName) - f.close() - cleanup(tmp, tmp_folder) + #cleanup(tmp, tmp_folder) sys.exit() #else: #print("\t ...finished") From e85fd1c561df192ebdcd15ddd0c84336baad327f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 17 Oct 2021 12:24:31 +0200 Subject: [PATCH 129/192] bug fix in ortholog search if tblastn takes to long --- fdog/fDOGassembly.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 7c45233..0a9df8f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -244,7 +244,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID:" + name + " , continuing with next region") + print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() def searching_for_db(assembly_path): @@ -315,7 +315,12 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) - distances = get_distance_biopython(aln_file, matrix) + try: + distances = get_distance_biopython(aln_file, matrix) + except ValueError: + print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" + distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] @@ -374,7 +379,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" + if distance_ref_hit != "NaN": + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: print("\tnothitting\n") if mode == "debug" else "" elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: @@ -629,7 +635,7 @@ def ortholog_search(args): fasOutFile = out + "/" + group #mappingFile = out + "/tmp/" + group + ".mapping.txt" - print("Searching in species " + asName + "\n") + sys.stdout.write("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" db_check = searching_for_db(db_path) @@ -649,9 +655,10 @@ def ortholog_search(args): time_tblastn_end = time.time() time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: - print("The tblastn search takes too long for species %s. Exciting ..." % asName) + sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) #cleanup(tmp, tmp_folder) - sys.exit() + #sys.exit() + return [], candidatesOutFile #else: #print("\t ...finished") print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) @@ -659,7 +666,7 @@ def ortholog_search(args): regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: #no candidat region are available, no ortholog can be found - print("No candidate region found for species %s!\n" % asName) + sys.stdout.write("No candidate region found for species %s!\n" % asName) return [], candidatesOutFile else: @@ -684,7 +691,7 @@ def ortholog_search(args): if reciprocal_sequences == 0: if regions != 0: - print("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) return [], candidatesOutFile else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) @@ -976,7 +983,7 @@ def main(): end = time.time() time_fas = end - fas print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") - print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ f.close() From 1f9f736325253c08f33d60bc787136c10f6ef303 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 18 Oct 2021 10:23:16 +0200 Subject: [PATCH 130/192] updated input options --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0a9df8f..e40701b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -727,7 +727,7 @@ def main(): required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) - required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) + required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) @@ -744,10 +744,10 @@ def main(): optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) - optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxa', help='Search Taxon name', action='store', nargs="+", default=[]) + optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[]) optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) From 42e4ba122504f24a25f748c087d14aa0d199a419 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Tue, 19 Oct 2021 10:51:15 +0200 Subject: [PATCH 131/192] Update fDOG goes assembly to version 0.1.2 (#12) --- fdog/addTaxa.py | 15 +- fdog/addTaxon.py | 62 ++-- fdog/bin/hamstr.pl | 141 ++++----- fdog/bin/oneSeq.pl | 204 +++++-------- fdog/checkData.py | 69 +++-- fdog/fDOGassembly.py | 563 ++++++++++++++++++++++-------------- fdog/mergeAssemblyOutput.py | 124 -------- fdog/mergeOutput.py | 7 +- fdog/removefDog.py | 4 +- fdog/runMulti.py | 46 +-- fdog/runSingle.py | 74 +---- fdog/setup/install_lib.sh | 3 - fdog/setup/setup.sh | 43 +-- fdog/setup/setup_conda.sh | 25 +- setup.py | 4 +- 15 files changed, 648 insertions(+), 736 deletions(-) delete mode 100644 fdog/mergeAssemblyOutput.py diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index d392c8c..fa4a3a1 100644 --- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -37,6 +37,7 @@ import re import shutil from tqdm import tqdm +from datetime import datetime def checkFileExist(file): if not os.path.exists(os.path.abspath(file)): @@ -68,20 +69,18 @@ def parseMapFile(mappingFile): try: ver = tmp[3].strip() except: - ver = 1 + ver = datetime.today().strftime('%y%m%d') #1 # print(taxName+"@"+str(taxId)+"@"+str(ver)) nameDict[fileName] = (taxName, str(taxId), str(ver)) return(nameDict) def runAddTaxon(args): - (f,n,i,o,c,v,a,cpus,replace,delete,oldFAS) = args + (f,n,i,o,c,v,a,cpus,replace,delete) = args cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus) if c == True: cmd = cmd + ' -c' if a == True: cmd = cmd + ' -a' - if oldFAS == True: - cmd = cmd + ' --oldFAS' if replace == True: cmd = cmd + ' --replace' if delete == True: @@ -95,7 +94,7 @@ def runAddTaxon(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.5' + version = '0.0.9' parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -105,8 +104,7 @@ def main(): action='store', default='', required=True) optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='') optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False) - optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using annoFAS', action='store_true', default=False) - optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False) + optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False) optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int) optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False) @@ -128,7 +126,6 @@ def main(): outPath = os.path.abspath(outPath) noAnno = args.noAnno coreTaxa = args.coreTaxa - oldFAS = args.oldFAS cpus = args.cpus if cpus == 0: cpus = mp.cpu_count()-2 @@ -171,7 +168,7 @@ def main(): verProt = nameDict[f][2] jobs.append([ folIn + '/' + f, nameDict[f][0], nameDict[f][1], - outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete, oldFAS + outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete ]) if len(dupList) > 0: diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index fe0a810..f962cba 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -32,6 +32,7 @@ import multiprocessing as mp from ete3 import NCBITaxa import re +import shutil from datetime import datetime def checkFileExist(file): @@ -83,7 +84,7 @@ def runBlast(args): os.symlink(fileInGenome, fileInBlast) def main(): - version = '0.0.5' + version = '0.0.10' parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -91,10 +92,9 @@ def main(): required.add_argument('-i', '--taxid', help='Taxonomy ID of input taxon', action='store', default='', required=True, type=int) optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='') optional.add_argument('-n', '--name', help='Acronym name of input taxon', action='store', default='', type=str) - optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default=1, type=str) + optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default='', type=str) optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False) - optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using annoFAS', action='store_true', default=False) - optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False) + optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False) optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int) optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False) @@ -119,7 +119,8 @@ def main(): noAnno = args.noAnno coreTaxa = args.coreTaxa ver = str(args.verProt) - oldFAS = args.oldFAS + if ver == '': + ver = datetime.today().strftime('%y%m%d') cpus = args.cpus if cpus == 0: cpus = mp.cpu_count()-2 @@ -135,6 +136,13 @@ def main(): specName = name+'@'+taxId+'@'+ver print('Species name\t%s' % specName) + ### remove old folder if force is set + if force: + if os.path.exists(outPath + '/genome_dir/' + specName): + shutil.rmtree(outPath + '/genome_dir/' + specName) + if os.path.exists(outPath + '/blast_dir/' + specName): + shutil.rmtree(outPath + '/blast_dir/' + specName) + ### create file in genome_dir print('Parsing FASTA file...') Path(outPath + '/genome_dir').mkdir(parents = True, exist_ok = True) @@ -147,25 +155,30 @@ def main(): f = open(specFile, 'w') index = 0 modIdIndex = 0 - longId = 'no' + # longId = 'no' tmpDict = {} + # with open(specFile + '.mapping', 'a') as mappingFile: for id in inSeq: seq = str(inSeq[id].seq) # check ID - id = re.sub('\|', '_', id) - oriId = id - if len(id) > 30: - modIdIndex = modIdIndex + 1 - id = specName + "_" + str(modIdIndex) - longId = 'yes' - with open(specFile + '.mapping', 'a') as mappingFile: - mappingFile.write('%s\t%s\n' % (id, oriId)) - if not id in tmpDict: - tmpDict[id] = 1 + # oriId = id + if ' ' in id: + sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id) else: - index = index + 1 - id = str(id) + '_' + str(index) - tmpDict[id] = 1 + if '\|' in id: + print('\033[91mWARNING: Sequence IDs contain pipe(s). They will be replaced by "_"!\033[0m') + id = re.sub('\|', '_', id) + # if len(id) > 20: + # modIdIndex = modIdIndex + 1 + # id = modIdIndex + # longId = 'yes' + # if not id in tmpDict: + # tmpDict[id] = 1 + # else: + # index = index + 1 + # id = str(index) + # tmpDict[id] = 1 + # mappingFile.write('%s\t%s\n' % (id, oriId)) # check seq if seq[-1] == '*': seq = seq[:-1] @@ -187,8 +200,8 @@ def main(): cf.write(str(datetime.now())) cf.close() # warning about long header - if longId == 'yes': - print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile) + # if longId == 'yes': + # print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile) else: print(genomePath + '/' + specName + '.fa already exists!') @@ -207,16 +220,13 @@ def main(): ### create annotation if not noAnno: Path(outPath + '/weight_dir').mkdir(parents = True, exist_ok = True) - annoCmd = 'annoFAS -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus) + annoCmd = 'fas.doAnno -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus) if force: annoCmd = annoCmd + " --force" - if oldFAS: - print("running old version of FAS...") - annoCmd = 'annoFAS -i %s/%s.fa -o %s -n %s --cores %s' % (genomePath, specName, outPath+'/weight_dir', specName, cpus) try: subprocess.call([annoCmd], shell = True) except: - print('\033[91mProblem with running annoFAS. You can check it with this command:\n%s\033[0m' % annoCmd) + print('\033[91mProblem with running fas.doAnno. You can check it with this command:\n%s\033[0m' % annoCmd) print('Output for %s can be found in %s within genome_dir [and blast_dir, weight_dir] folder[s]' % (specName, outPath)) diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl index 7ff125e..3feb01e 100755 --- a/fdog/bin/hamstr.pl +++ b/fdog/bin/hamstr.pl @@ -195,9 +195,10 @@ ## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef ## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name ## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory +## 19.03.2021 (v13.4.5 - vinh) do not replace space by @ for hmm output in parseHmmer4pm ######################## start main ########################################### -my $version = "HaMStR v.13.4.4"; +my $version = "HaMStR v.13.4.5"; ######################## checking whether the configure script has been run ### my $configure = 0; if ($configure == 0){ @@ -315,7 +316,7 @@ my $ublast = 0; my $accel = 0.8; #####determine the hostname####### -push @log, "VERSION:\t$version\n"; +# push @log, "VERSION:\t$version\n"; my $hostname = `hostname`; chomp $hostname; push @log, "HOSTNAME\t$hostname\n"; @@ -520,7 +521,7 @@ exit; } else { - open (OUT, ">$outpath/hamstrsearch.log") or die "could not open logfile\n"; + open (OUT, ">$outpath/fdog.log") or die "could not open logfile\n"; print OUT join "\n", @log; close OUT; } @@ -1059,7 +1060,7 @@ sub checkInput { } } } else { - push @log, "\trunning HaMStR with all hmms in $hmm_dir"; + push @log, "\trunning fDOG with all hmms in $hmm_dir"; my $hmm_dir_tmp = $hmm_dir; $hmm_dir_tmp =~ s/\|/\\\|/g; @hmms = `ls $hmm_dir_tmp`; } @@ -1299,10 +1300,10 @@ sub checkInput { } ## 14) determin whether or not the -representative flag has been set if (defined $rep) { - push @log, "\tHaMStR will run with the -representative option"; + push @log, "\tfDOG will run with the -representative option"; } else { - push @log, "\tHaMStR was called without the -representative option. More than one ortholog may be identified per core-ortholog group!"; + push @log, "\tfDOG was called without the -representative option. More than one ortholog may be identified per core-ortholog group!"; } ## check further options @@ -1854,68 +1855,68 @@ sub revComp { return($seq); } ############################## -sub parseHmmer3pm { - my ($file, $path) = @_; - my $hits; - my $query; - my %tmphash; - if (!defined $path){ - $path = '.'; - } - $file = $path . '/' . $file; - my $in = Bio::SearchIO->new( - -format => 'hmmer', - -file => $file - ); - while( my $result = $in->next_result ) { - # this is a Bio::Search::Result::HMMERResult object - if (!defined $query){ - $query = $result->query_name(); - printOUT("query is $query\n"); - } - my $hitcount = 0; - while( my $hit = $result->next_hit ) { - my $tmp = $hit->name(); - my $tmpscore = $hit->score(); - $tmp =~ s/_RF.*//; - if (!defined $tmphash{$tmp}){ - $hits->[$hitcount]->{id} = $tmp; - $hits->[$hitcount]->{hmmscore} = $tmpscore; - $hitcount++; - $tmphash{$tmp}=1; - if (defined $bhh){ - last; - } - } - } - - if (defined $hits->[0]) { - ####### a quick hack to obtain the lagPhase value - my $criticalValue; # takes the value used for candidate discrimination - my $hitLimitLoc = $hitlimit; - if (defined $autoLimit) { - printDebug("Entering getLag Routine\n"); - ## the user has invoked the autmated inference of a hit limit - ($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount); - if (!defined $criticalValue) { - ## there was a problem in the computatation of the lagPhase - print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n"; - ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); - } - } - elsif (defined $scoreThreshold) { - printDebug("entering the scoreThreshold routine"); - ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); - printDebug("hitlimitloc is now $hitLimitLoc"); - } - - return ($query, $hits, $hitLimitLoc, $criticalValue); - } - else { - return ($query); - } - } -} +# sub parseHmmer3pm { +# my ($file, $path) = @_; +# my $hits; +# my $query; +# my %tmphash; +# if (!defined $path){ +# $path = '.'; +# } +# $file = $path . '/' . $file; +# my $in = Bio::SearchIO->new( +# -format => 'hmmer', +# -file => $file +# ); +# while( my $result = $in->next_result ) { +# # this is a Bio::Search::Result::HMMERResult object +# if (!defined $query){ +# $query = $result->query_name(); +# printOUT("query is $query\n"); +# } +# my $hitcount = 0; +# while( my $hit = $result->next_hit ) { +# my $tmp = $hit->name(); +# my $tmpscore = $hit->score(); +# $tmp =~ s/_RF.*//; +# if (!defined $tmphash{$tmp}){ +# $hits->[$hitcount]->{id} = $tmp; +# $hits->[$hitcount]->{hmmscore} = $tmpscore; +# $hitcount++; +# $tmphash{$tmp}=1; +# if (defined $bhh){ +# last; +# } +# } +# } +# +# if (defined $hits->[0]) { +# ####### a quick hack to obtain the lagPhase value +# my $criticalValue; # takes the value used for candidate discrimination +# my $hitLimitLoc = $hitlimit; +# if (defined $autoLimit) { +# printDebug("Entering getLag Routine\n"); +# ## the user has invoked the autmated inference of a hit limit +# ($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount); +# if (!defined $criticalValue) { +# ## there was a problem in the computatation of the lagPhase +# print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n"; +# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); +# } +# } +# elsif (defined $scoreThreshold) { +# printDebug("entering the scoreThreshold routine"); +# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); +# printDebug("hitlimitloc is now $hitLimitLoc"); +# } +# +# return ($query, $hits, $hitLimitLoc, $criticalValue); +# } +# else { +# return ($query); +# } +# } +# } ############################## sub parseHmmer4pm { my ($file, $path) = @_; @@ -1931,9 +1932,9 @@ sub parseHmmer4pm { $file = $path . '/' . $file; $file =~ s/\|/\\\|/g; - my @hmmout = `$grepprog -v '#' $file |sort -rnk 9 |sed -e 's/ /@/g'`; + my @hmmout = `$grepprog -v '#' $file |sort -rnk 9`; for (my $i = 0; $i < @hmmout; $i++) { - ($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/@+/, $hmmout[$i]); + ($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/\s+/, $hmmout[$i]); if (!defined $query){ $query = $hmmhits->[$i]->{query_name}; diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7e8a248..a99e1e6 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -127,13 +127,17 @@ ## Modified 24. March 2021 v2.2.8 (Vinh) - skip fa.mapping while checking genome_dir ## Modified 29. March 2021 v2.2.9 (Vinh) - check for zero $maxAlnScore ## - solved problem with long input path for fasta36 tools +## Modified 23. April 2021 v2.3.0 (Vinh) - parse fasta36 output for long IDs (longer than 60 chars) +## Modified 31. May 2021 v2.3.1 (Vinh) - added auto annotation for fdogFas +## Modified 11. June 2021 v2.3.2 (Vinh) - fixed --append option +## Modified 16. June 2021 v2.4.0 (Vinh) - add checkOff option ############ General settings -my $version = 'oneSeq v.2.2.9'; +my $version = 'oneSeq v.2.4.0'; ##### configure for checking if the setup.sh script already run my $configure = 0; if ($configure == 0){ - die "\n\n$version\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n"; + die "\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n"; } ##### hostname my $hostname = `hostname`; @@ -173,9 +177,9 @@ my $outputfmt = 'blastxml'; my $eval_blast_query = 0.0001; my $filter = 'F'; # default for blastp -my $annotation_prog = "annoFAS"; -my $fas_prog = "calcFAS"; -my $fdogFAS_prog = "fdogFAS"; +my $annotation_prog = "fas.doAnno"; +my $fas_prog = "fas.run"; +my $fdogFAS_prog = "fas.runFdogFas"; ##### ublast Baustelle: not implemented yet my $runublast = 0; @@ -203,7 +207,6 @@ my $idx_dir = "$path/taxonomy/"; my $dataDir = $path . '/data'; my $weightPath = "$path/weight_dir/"; -my $assembly_dir = "$path/assembly_dir/"; my @defaultRanks = ( 'superkingdom', 'kingdom', @@ -260,6 +263,7 @@ my $blastNode; my $representative; my $core_rep; +my $checkOff; my $debug; my $corestrict; my $inputSeq = ""; @@ -307,15 +311,6 @@ my %hashTree; my $aln = 'muscle'; my $searchTaxa; -#variables for fdog_goes_assembly -my $assembly; -my $augustusRefSpec; -my $avIntron; -my $lengthExtension; -my $assemblyPath; -my $searchTool = 'blast'; -my $matrix = 'blosum62'; -my $dataPath = ''; ################# Command line options GetOptions ( "h" => \$help, @@ -365,6 +360,7 @@ "blastpath=s" => \$blastPath, "searchpath=s" => \$genome_dir, "weightpath=s" => \$weightPath, + "checkOff" => \$checkOff, "debug" => \$debug, "coreHitlimit=s" => \$core_hitlimit, "hitlimit=s" => \$hitlimit, @@ -377,15 +373,7 @@ "distDeviation=s" => \$distDeviation, "aligner=s" => \$aln, "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa, - "assembly" => \$assembly, - "assemblypath=s" => \$assemblyPath, - "augustusRefSpec=s" => \$augustusRefSpec, - "avIntron=s" => \$avIntron, - "lengthExtension=s" => \$lengthExtension, - "searchTool=s" => \$searchTool, - "scoringmatrix=s" => \$matrix, - "dataPath=s" => \$dataPath + "searchTaxa=s" => \$searchTaxa ); $outputPath = abs_path($outputPath); @@ -397,17 +385,16 @@ $weightPath = abs_path($weightPath)."/"; $genome_dir = abs_path($genome_dir)."/"; $taxaPath = $genome_dir; -$dataPath = abs_path($dataPath)."/"; -$assembly_dir = abs_path($assemblyPath)."/"; ############# do initial check if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { print "Validity checking....\n"; my $checkStTime = gettime(); - initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); - print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n"; + unless($checkOff) { + initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); + } - if (!defined $coreex && !defined $assembly) { + if (!defined $coreex) { if (!grep(/$minDist/, @defaultRanks)) { die "ERROR: minDist $minDist invalid!\n"; } @@ -420,6 +407,7 @@ die "ERROR: coreOrth not defined (must be integer)!"; } } + print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n"; } ############# show version @@ -490,7 +478,7 @@ # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) # get annotations for seed sequence if fas support is on -if ($fas_support && !$assembly){ +if ($fas_support){ if (!$weightPath) { createWeightFolder(); } @@ -499,7 +487,7 @@ my $coreStTime = gettime(); #time; #core-ortholog search -if (!$coreex && !$assembly) { +if (!$coreex) { print "\nCore compiling...\n"; $coremode = 1; $taxaPath = $blastPath; @@ -637,12 +625,7 @@ my $final_eval_blast = $eval_blast*$eval_relaxfac; my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - if (!$assembly){ - $taxaPath = $genome_dir; - } - else{ - $taxaPath = $assembly_dir; - } + $taxaPath = $genome_dir; my @searchTaxa; unless ($searchTaxa) { unless($groupNode) { @@ -698,72 +681,20 @@ } } } - if ($assembly){ - $eval_blast = sprintf("%f", $eval_blast); - if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); - - if (defined $assemblyPath){ - push(@assembly_cmd, "--assemblyPath $assemblyPath") - } - if (defined $avIntron){ - push(@assembly_cmd, "--avIntron $avIntron "); - } - if (defined $lengthExtension){ - push(@assembly_cmd, "--lengthExtension $lengthExtension "); - } - if (!$autoclean){ - push(@assembly_cmd, "--tmp "); - } - if ($outputPath){ - push(@assembly_cmd, "--out $outputPath "); - } - if (defined $strict){ - push(@assembly_cmd, "--strict"); - } - if ($eval_blast){ - push(@assembly_cmd, "--evalBlast $eval_blast "); - } - if ($searchTool){ - push(@assembly_cmd, "--msaTool $aln "); - } - if (defined $checkcoorthologsref){ - push(@assembly_cmd, "--checkCoorthologsRef"); - } - if ($searchTool){ - push(@assembly_cmd, "--searchTool $searchTool"); - } - if ($matrix){ - push(@assembly_cmd, "--scoringmatrix $matrix"); - } - if ($coreOrthologsPath){ - push(@assembly_cmd, "--coregroupPath $coreOrthologsPath"); - } - if ($fasoff){ - push(@assembly_cmd, "--fasoff"); - } - if ($searchTaxon){ - push(@assembly_cmd, "--searchTaxon $searchTaxon"); - } - if ($filter){ - push(@assembly_cmd, "--filter $filter"); - } - printDebug(@assembly_cmd); - system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n"; - } - } - else{ runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln); - } $pm->finish; } $pm->wait_all_children; } +### remove duplicated seq in extended.fa +if (-e $finalOutput) { + addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); +} push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n"; - -if(!$coreOnly && !$assembly){ +## Evaluation of all orthologs that are predicted by the final run +if(!$coreOnly){ my $fasStTime = gettime(); my $processID = $$; @@ -775,9 +706,9 @@ addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # calculate FAS scores for final extended.fa - if ($fas_support && !$assembly) { + if ($fas_support) { print "Starting the feature architecture similarity score computation...\n"; - my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu"; + my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno"; unless ($countercheck) { $fdogFAScmd .= " --bidirectional" } @@ -788,21 +719,12 @@ } push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; - if($autoclean){ print "Cleaning up...\n"; runAutoCleanUp($processID); } } -if ($assembly){ - my $file_assembly_out; - $file_assembly_out = $outputPath . '/' . $seqName; - my $cmd_merge; - $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup"; - printDebug($cmd_merge); - system($cmd_merge); -} ## Delete tmp folder unless ($debug) { my $delTmp = "rm -rf $tmpdir"; @@ -814,7 +736,10 @@ push @logOUT, "fdog finished after " . roundtime(gettime() - $startTime) . " sec!\n"; #### writing the log -open (LOGOUT, ">$outputPath/fdog.log") or warn "Failed to open fdog.log for writing"; +open (LOGOUT, ">>$outputPath/fdog.log") or die "Could not open $outputPath/fdog.log for writing\n"; +print LOGOUT "\n\n"; +my $fdogVersion = `fdog.run --version`; +print LOGOUT "fDOG v$fdogVersion\n"; print LOGOUT join "\n", @logOUT; close LOGOUT; exit; @@ -1209,10 +1134,10 @@ sub checkOptions { if ($force == 1 and $append ==1) { $force = 0; } - ### check the presence of the pre-computed core set if options reuseCore or assembly is used - if ($coreex || $assembly) { + ### check the presence of the pre-computed core set + if ($coreex) { if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; + print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; exit; } } @@ -1283,7 +1208,7 @@ sub checkOptions { ### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected $optbreaker = 0; - while(!$minCoreOrthologs and (!$coreex and !$assembly)) { + while(!$minCoreOrthologs and !$coreex) { if ($optbreaker >= 3){ print "No proper number given ... exiting.\n"; exit; @@ -1298,12 +1223,10 @@ sub checkOptions { $filter = 'no' if $filter eq 'F'; } - if (!$assembly){ - $inputSeq = fetchSequence($seqFile, $dataDir); - } + $inputSeq = fetchSequence($seqFile, $dataDir); ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '' && !$assembly) { + if($seqId eq '') { my $besthit; if (!$blast){ ## a refspec has been determined @@ -1318,9 +1241,9 @@ sub checkOptions { $refSpec = $besthit->{species}; my $details = "Evalue: " . $besthit->{evalue}; printOut("Seq id has been determined as $seqId in $refSpec with $details", 2); - if(length("$seqName|$refSpec|$seqId") > 60) { - die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; - } + # if(length("$seqName|$refSpec|$seqId") > 60) { + # die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; + # } if($seqId eq '') { print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n"; exit; @@ -1398,22 +1321,24 @@ sub checkOptions { mkdir $outputPath or die "could not re-create the output directory $outputPath\n"; } elsif ($append) { - printOut("Appending output to $finalOutput\n", 1); - if (-e "$outputPath/$seqName.extended.profile") { + if (-e "$outputPath/$seqName.extended.fa") { ## read in the content for latter appending - printOut("Appending output to $outputPath/$seqName.extended.profile", 1); - open (IN, "<$outputPath/$seqName.extended.profile") or die "failed to open $outputPath/$seqName.extended.profile after selection of option -append\n"; + printOut("Appending output to $outputPath/$seqName.extended.fa", 1); + open (IN, "<$outputPath/$seqName.extended.fa") or die "failed to open $outputPath/$seqName.extended.fa after selection of option -append\n"; while () { - chomp $_; - my @keys = split '\|', $_; - $profile{$keys[1]} = 1; + my $line = $_; + if ($line =~ /\|/) { + chomp $line; + my @keys = split '\|', $line; + $profile{$keys[1]} = 1; + } } } elsif ($fasoff) { ## no extended.profile file exists but not necessary, because user switched off FAS support -> do nothing } else { - printOut("Option -append was selected, but the existing output was incomplete. Please restart with the -force option to overwrite the output"); + printOut("Option -append was selected, but the existing output was incomplete. Please restart with the -force option to overwrite the output", 1); exit; } } @@ -1428,9 +1353,8 @@ sub checkOptions { #### checking for the min and max distance for the core set compilation #### omit this check, if the option reuseCore has been selected (added 2019-02-04) $optbreaker = 0; - if (!$coreex and !$assembly) { + if (!$coreex) { my $node; - #print "Testing coreex assembly\n"; $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); $node->name('supplied', $refSpec); if (lc($maxDist) eq "root"){ @@ -1790,8 +1714,9 @@ sub cumulativeAlnScore{ my $line = $_; $line =~ s/[\(\)]//g; my @line = split('\s+',$line); - - if($line[0] && ($line[0] eq $key)){ + my $shortedId = substr($key, 0, 60); + # if($line[0] && ($line[0] eq $key)){ + if($line[0] && ($line[0] eq $shortedId)){ if(exists $cumscores{$key}) { $gotScore = 1; $cumscores{$key} = $cumscores{$key} + $line[2]; @@ -2146,7 +2071,7 @@ sub addSeedSeq { # get seed sequence and add it to the beginning of the fasta output open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n"; my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta'); - my %idTmp; # used to check which seq has already been written to output + my %idTmp = (); # used to check which seq has already been written to output while(my $seq = $seqio->next_seq) { my $id = $seq->id; if ($id =~ /$refSpec/) { @@ -2162,6 +2087,7 @@ sub addSeedSeq { unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) { unless ($idTmp{$id}) { print TEMP ">$id\n", $seq->seq, "\n"; + $idTmp{$id} = 1; } } } @@ -2643,9 +2569,9 @@ sub initialCheck { } # check executable FAS - my $fasCheckMsg = `setupFAS -t ./ -c 2>&1`; + my $fasCheckMsg = `fas.setup -t ./ -c 2>&1`; if ($fasoff != 1 && $fasCheckMsg =~ /ERROR/) { - die "ERROR: greedyFAS not ready to use! Please check https://github.com/BIONF/FAS/wiki/prepareFAS\n"; + die "ERROR: FAS not ready to use! Please check https://github.com/BIONF/FAS/wiki/setup\n"; } # check seed fasta file @@ -2690,9 +2616,19 @@ sub initialCheck { } } # check weight_dir - if ($fasoff != 1 && !$assembly) { + if ($fasoff != 1) { my %seen; my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); + my @notFolder; + for (my $i = 0;$i < scalar(@allTaxa); $i++){ + if (-f "$blastDir/$allTaxa[$i]" || -f "$genomeDir/$allTaxa[$i]") { + push(@notFolder, $allTaxa[$i]); + splice(@allTaxa, $i, 1); + } + } + if (scalar(@notFolder) > 0) { + print "*** WARNING: Found files in $genomeDir or $blastDir:\t@notFolder\n"; + } chomp(my $allAnno = `ls $weightDir | $sedprog \'s/\\.json//\'`); my @allAnno = split(/\n/, $allAnno); my @missingAnno = array_minus(@allTaxa, @allAnno); diff --git a/fdog/checkData.py b/fdog/checkData.py index 84310ac..3aafe44 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -133,28 +133,29 @@ def checkDataFolder(checkDir, replace, delete, concat): if os.path.islink(faFile): faFile = os.path.realpath(faFile) checkFileExist(faFile) - if not '.checked' in faFile: - if not os.path.exists(faFile+".checked"): - checkFaFile = checkValidFasta(faFile) - if checkFaFile == 'notFasta': - sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile) - elif checkFaFile == 'longHeader': - sys.exit('*** ERROR: %s contains long headers!' % faFile) - elif checkFaFile == 'space': - sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile) - elif checkFaFile == 'multiLine': - if not concat: - print('*** ERROR: %s contains multiple-line sequences!' % faFile) - sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') - else: - rewriteSeqs(faFile, replace, delete) - elif checkFaFile == 'ok': - if not (delete or replace): - checkValidSeqs(faFile) - else: - rewriteSeqs(faFile, replace, delete) - writeCheckedFile(faFile) - print(fd) + if not '.mapping' in faFile: + if not '.checked' in faFile: + if not os.path.exists(faFile+".checked"): + checkFaFile = checkValidFasta(faFile) + if checkFaFile == 'notFasta': + sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile) + elif checkFaFile == 'longHeader': + sys.exit('*** ERROR: %s contains long headers!' % faFile) + elif checkFaFile == 'space': + sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile) + elif checkFaFile == 'multiLine': + if not concat: + print('*** ERROR: %s contains multiple-line sequences!' % faFile) + sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') + else: + rewriteSeqs(faFile, replace, delete) + elif checkFaFile == 'ok': + if not (delete or replace): + checkValidSeqs(faFile) + else: + rewriteSeqs(faFile, replace, delete) + writeCheckedFile(faFile) + print(fd) taxaList.append(fd) except subprocess.CalledProcessError as e: print('*** ERROR: Problem while searching for fasta file') @@ -162,13 +163,28 @@ def checkDataFolder(checkDir, replace, delete, concat): sys.exit() return(taxaList) -def checkCompleteAnno(weightDir, taxaList): +def checkMissingJson(weightDir, taxaList): allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] taxaAnno = [s + '.json' for s in taxaList] s = set(allAnno) missingAnno = [x for x in taxaAnno if x not in s] return(missingAnno) +def checkCompleteAnno(weightDir, genomeDir): + allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] + for f in allAnno: + tax = f.replace('.json', '') + print('...check annotations for %s' % tax) + jf = '%s/%s.json' % (weightDir, tax) + gf = '%s/%s/%s.fa' % (genomeDir, tax, tax) + cmd = 'fas.checkAnno -s %s -a %s -o %s' % (gf, jf, weightDir) + try: + subprocess.call([cmd], shell = True) + except subprocess.CalledProcessError as e: + print('*** ERROR: Problem while checking annotation file using fas.checkAnno!') + print(e.output.decode(sys.stdout.encoding)) + sys.exit() + def checkMissingNcbiID(namesDmp, taxaList): ncbiId = {} with open(namesDmp, 'r') as f: @@ -193,7 +209,7 @@ def checkMissingNcbiID(namesDmp, taxaList): return(missingTaxa.keys(), dupTaxa) def main(): - version = '0.0.3' + version = '0.0.6' parser = argparse.ArgumentParser(description='You are running fdog.checkData version ' + str(version) + '.') parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='') parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. fdog_dataPath/blast_dir)', action='store', default='') @@ -237,12 +253,13 @@ def main(): ### check weightDir print('=> Checking %s...' % weightDir) - missingAnno = checkCompleteAnno(weightDir, join2Lists(genomeTaxa, blastTaxa)) + missingAnno = checkMissingJson(weightDir, join2Lists(genomeTaxa, blastTaxa)) if len(missingAnno) > 0: - print('\033[92m*** WARNING: Annotations not found for:\033[0m') + print('\033[92m*** WARNING: Annotation files not found for:\033[0m') print(*missingAnno, sep = "\n") print('NOTE: You still can run fdog without FAS using the option "-fasoff"') caution = 1 + checkCompleteAnno(weightDir, genomeDir) ### check ncbi IDs print('=> Checking NCBI taxonomy IDs...') diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 46f83c0..12fcf6f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Hannah Muelbaier + +# Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog # searches on genome assemblies @@ -27,8 +28,30 @@ import yaml import subprocess import time -======= +import shutil +import multiprocessing as mp + ########################### functions ########################################## +def check_path(path): + if not os.path.exists(path): + print(path + " does not exist. Exciting ...") + sys.exit() + +def check_ref_sepc(species_list, fasta_file): + file = open(fasta_file, "r") + lines = file.readlines() + species_file = [] + + for line in lines: + if line[0] == ">": + species = line.split("|")[1] + species_file.append(species) + for species in species_list: + if species in species_file: + return species + print("Reference species is not part of the ortholog group. Exciting ...") + sys.exit() + def load_config(config_file): with open(config_file, 'r') as stream: try: @@ -36,23 +59,27 @@ def load_config(config_file): except yaml.YAMLError as exc: print(exc) -def starting_subprocess(cmd, mode): - if mode == 'debug': - result = subprocess.run(cmd, shell=True) - elif mode == 'silent': - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - elif mode == 'normal': - result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) +def starting_subprocess(cmd, mode, time_out = None): + + try: + if mode == 'debug': + result = subprocess.run(cmd, shell=True, timeout = time_out) + elif mode == 'silent': + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out) + elif mode == 'normal': + result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out) + except subprocess.TimeoutExpired: + return 1 def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions + #format dictionary: {node_name: [(,,evalue, ,,, )]} number_regions = 0 insert_length = int(insert_length) + score_list = [] for key in blast_results: locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) - #print("test") - #print(locations) size_list = len(locations) j = 0 while j < size_list-1: @@ -62,6 +89,8 @@ def merge(blast_results, insert_length): #merge overlapping regions plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -69,6 +98,8 @@ def merge(blast_results, insert_length): #merge overlapping regions minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -76,6 +107,8 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 @@ -83,20 +116,24 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 i += 1 j += 1 + for entry in locations: + score_list.append(entry[6]) number_regions += len(locations) blast_results[key] = locations - return blast_results, number_regions + return blast_results, number_regions, score_list def parse_blast(line, blast_results, cutoff): - # format blast line: - # format dictionary: {node_name: [(,,evalue, ,,)]} + # format blast line: + # format dictionary: {node_name: [(,,evalue, ,,, )]} line = line.replace("\n", "") line_info = line.split("\t") evalue = float(line_info[3]) @@ -105,7 +142,7 @@ def parse_blast(line, blast_results, cutoff): return blast_results, evalue #add region to dictionary else: - node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) + node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6]) split = node_name.split("|") # finding out on which strand tBLASTn found a hit if sstart < send: @@ -119,14 +156,32 @@ def parse_blast(line, blast_results, cutoff): node_name = split[1] if node_name in blast_results: list = blast_results[node_name] - list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand]) + list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]) blast_results[node_name] = list else: - blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]] + blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]] return blast_results, evalue -def candidate_regions(intron_length, cutoff_evalue, tmp_path): +def get_x_results(blast_dic, x, score_list): + + new_dic = {} + score_list.sort(reverse=True) + min = score_list[x - 1] + number_regions = 0 + + for key in blast_dic: + key_list = [] + entries = blast_dic[key] + for i in entries: + if i[6] >= min: + key_list.append(i) + if key_list != []: + new_dic[key] = key_list + number_regions += len(key_list) + return new_dic, number_regions + +def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 blast_file = open(tmp_path + "/blast_results.out", "r") @@ -142,10 +197,13 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) if blast_results == {}: + blast_file.close() return 0,0 else: - candidate_regions, number_regions = merge(blast_results, intron_length) - + candidate_regions, number_regions, score_list = merge(blast_results, intron_length) + blast_file.close() + if number_regions > x: + candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list) return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path, mode): @@ -187,7 +245,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID:" + name + " , continuing with next region") + print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() def searching_for_db(assembly_path): @@ -250,11 +308,20 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates if msaTool == "muscle": os.system("muscle -quiet -in " + output_file + " -out " + aln_file) #print("muscle -quiet -in " + output_file + " -out " + aln_file) + if not os.path.exists(aln_file): + print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.") + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + elif msaTool == "mafft-linsi": #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) - distances = get_distance_biopython(aln_file, matrix) + try: + distances = get_distance_biopython(aln_file, matrix) + except ValueError: + print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" + distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] @@ -280,7 +347,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva try: id_ref = seedDic[fdog_ref_species] except KeyError: - print("The fDOG reference species isn't part of the core ortholog group, ... exciting") + #print("The fDOG reference species isn't part of the core ortholog group, ... exciting") return 0, seed if searchTool == "blast": cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -298,45 +365,46 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva id, gene, evalue = (line.replace("\n", "")).split("\t") gene_name = gene.split("|")[2] if gene_name != old_name: - print("candidate:%s"%(gene_name)) - print("blast-hit:%s"%(id)) + print("candidate:%s"%(gene_name)) if mode == "debug" else "" + print("blast-hit:%s"%(id)) if mode == "debug" else "" min = float(evalue) if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + if distance_ref_hit != "NaN": + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnothitting\n") + print("\tnothitting\n") if mode == "debug" else "" elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnot hitting\n") + print("\tnot hitting\n") if mode == "debug" else "" old_name = gene_name if orthologs == []: - print("No hit in the backward search, ...exciting") + #print("No hit in the backward search, ...exciting") return 0, seed else: @@ -361,12 +429,12 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set({}) for species in seed: - print("backward search in species " + species + "\n") + print("backward search in species %s\n" %species) orthologs_new = set({}) try: id_ref = seedDic[species] except KeyError: - print("The species " + species + " isn't part of the core ortholog group, ... exciting") + #print("The species " + species + " isn't part of the core ortholog group, ... exciting") return 0, seed cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -389,12 +457,13 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(species) #print(orthologs_new) + #print(orthologs) if species == fdog_ref_species: orthologs = orthologs_new else: orthologs = orthologs & orthologs_new - if orthologs == {}: - print("No ortholog was found with option --strict") + if len(orthologs) == 0: + #print("No ortholog was found with option --strict") return 0, seed @@ -403,6 +472,39 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set(orthologs) return list(orthologs), seed +def addRef(output, core_fasta, species_list): + #print(species_list) + output_file = open(output, "a+") + seq_records_core = readFasta(core_fasta) + seq_records_core = list(seq_records_core) + for species in species_list: + for entry_core in seq_records_core: + if species in entry_core.id: + output_file.write(">" + entry_core.id + "\n") + output_file.write(str(entry_core.seq) + "\n") + output_file.close() + +def addSeq(output, seq_list): + output_file = open(output, "a+") + + for item in seq_list: + #print(item) + candidate_fasta = item[1] + sequenceIds = item[0] + if sequenceIds == 0 or sequenceIds == []: + continue + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + output_file.close() + def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): output_file = open(output, "a+") @@ -441,12 +543,18 @@ def createFasInput(orthologsOutFile, mappingFile): ncbi_id = (seq.id.split("@"))[1] mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n") - + mappingFile.close() return fas_seed_id def cleanup(tmp, tmp_path): if tmp == False: - os.system('rm -r ' + tmp_path) + timeout = time.time() + 60*1 + while os.path.exists(tmp_path): + shutil.rmtree(tmp_path, ignore_errors=True) + if time.time() > timeout: + print("tmp folder could not be removed!") + break + def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: @@ -517,6 +625,80 @@ def clean_fas(path, file_type): new_line = id + "\t" + remain file.write(new_line) + file.close() + +def ortholog_search(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + #orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + #mappingFile = out + "/tmp/" + group + ".mapping.txt" + + sys.stdout.write("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + db_check = searching_for_db(db_path) + + if db_check == 0: + #print("Creating a blast data base...") + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + #print("\t ...finished \n") + + #makes a tBLASTn search against database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + #print("Starting tBLASTn search...") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + time_tblastn_start = time.time() + exit_code = starting_subprocess(cmd, mode, 3600) + time_tblastn_end = time.time() + time_tblastn = time_tblastn_end - time_tblastn_start + if exit_code == 1: + sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) + #cleanup(tmp, tmp_folder) + #sys.exit() + return [], candidatesOutFile + #else: + #print("\t ...finished") + print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) + + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + if regions == 0: + #no candidat region are available, no ortholog can be found + sys.stdout.write("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile + + else: + print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search ################################### + #print("Starting augustus ppx ...") + time_augustus_start = time.time() + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + #print("\t ...finished \n") + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + print("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + #print("No genes found at candidate regions\n") + return [], candidatesOutFile + + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + + if reciprocal_sequences == 0: + if regions != 0: + sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + + return reciprocal_sequences, candidatesOutFile class Logger(object): def __init__(self, file): @@ -534,24 +716,22 @@ def flush(self): def main(): - #################### handle user input ######################################## + #################### handle user input ##################################### start = time.time() - - version = '0.1.1' - - + version = '0.1.2' + ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) - + ################## required arguments ###################################### required = parser.add_argument_group('Required arguments') required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) - required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True) - + required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) + ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) @@ -564,15 +744,16 @@ def main(): optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') - optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='') - optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) + #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') + optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[]) optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) - - + optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) + optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) args = parser.parse_args() # required @@ -602,15 +783,15 @@ def main(): msaTool = args.msaTool matrix = args.scoringmatrix taxa = args.coreTaxa - if taxa == '': - taxa =[] - else: - taxa = taxa.split(",") fasoff = args.fasoff - searchTaxon = args.searchTaxon + searchTaxa = args.searchTaxa silent = args.silent debug = args.debug + force = args.force + append = args.append + parallel = args.parallel + # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") return 1 @@ -637,23 +818,43 @@ def main(): dataPath = cfg['dataPath'] except: dataPath = 'config' + + if out == '': + out = os.getcwd() + else: + if out[-1] != "/": + out = out + "/" + check_path(out) + + if os.path.exists(out + '/' + group): + if append != True and force != True: + print("Output folder for group " + group + " exists already. Please choose --force or --append.") + sys.exit() + elif force == True: + shutil.rmtree(out + '/' + group, ignore_errors=True) + refBool = False + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + elif append == True: + out = out + '/' + group + '/' + refBool = True + else: + refBool = False # checks if sequences of reference species were already part of the extended.fa file + else: + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + refBool = False + if core_path == '': core_path = out + '/core_orthologs/' else: if not core_path.endswith('/'): core_path = core_path + '/' + check_path(core_path) if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' - if out == '': - #print('test out \n') - out = os.getcwd() - os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') - out = out + '/' + group + '/' - else: - if out[-1] != "/": - out = out + "/" - + check_path(assemblyDir) try: f = open(out + "/fdog.log", "a+") @@ -668,194 +869,130 @@ def main(): else: sys.stdout = Logger(f) - # user input has to be checked here before fDOGassembly continues - assembly_names = os.listdir(assemblyDir) - - ########################## some variables ################################## - - refBool = False # checks if sequences of reference species were already part of the extended.fa file + ########################### other variables ################################ + if searchTaxa == []: + assembly_names = os.listdir(assemblyDir) + else: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa - ########### paths ########### + ################################# paths #################################### msa_path = core_path + "/" + group +"/"+ group + ".aln" + check_path(msa_path) hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + check_path(hmm_path) fasta_path = core_path + "/" + group +"/"+ group + ".fa" + check_path(fasta_path) consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" + tmp_folder = out + "/tmp" + + ########### is/are fDOG reference species part of ortholog group? ########## + + fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) ###################### create tmp folder ################################### cmd = 'mkdir ' + out + '/tmp' starting_subprocess(cmd, 'silent') - ######################## consensus sequence ################################ + print("Gene: " + group) + print("fDOG reference species: " + fdog_ref_species + " \n") + ######################## consensus sequence ################################ + group_computation_time_start = time.time() #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence for gene " + group + " \n") + print("Building a consensus sequence") cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path starting_subprocess(cmd, mode) - print("consensus sequence is finished\n") + print("\t ...finished\n") ######################## block profile ##################################### - print("Building a block profile for gene " + group + " \n") + print("Building a block profile ...") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path starting_subprocess(cmd, 'silent') if int(os.path.getsize(profile_path)) > 0: - print("block profile is finished \n") + print("\t ...finished \n") else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") new_path = core_path + group +"/"+ group + "_new.aln" - #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - #print(cmd) starting_subprocess(cmd, 'silent') - print("block profile is finished \n") - - searchBool = False - - #################### fDOG assembly computation for all species ############# - for asName in assembly_names: - if searchBool == True: - break - if searchTaxon != '' and searchBool == False: - asName = searchTaxon - searchBool = True - - ################### path definitions ################################### - - cmd = 'mkdir ' + out + '/tmp/' + asName - starting_subprocess(cmd, 'silent') - tmp_path = out + "/tmp/" + asName + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - if searchTaxon != '': - orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" - fasOutFile = out + "/" + group + "_" + asName - mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" - else: - orthologsOutFile = out + "/" + group + ".extended.fa" - fasOutFile = out + "/" + group - mappingFile = out + "/tmp/" + group + ".mapping.txt" - - print("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - - ######################## tBLASTn ########################################### - #checks if data base exists already - db_check = searching_for_db(db_path) - if db_check == 0: - print("creating a blast data base \n") - cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path - starting_subprocess(cmd, mode) - print("database is finished \n") - else: - print('blast data base exists already, continuing...') - - #makes a tBLASTn search against the new database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("tBLASTn search against data base") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - starting_subprocess(cmd, mode) - print("tBLASTn search is finished") - - ################### search for candidate regions and extract seq ########### - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) - - if regions == 0: - #no candidat region are available, no ortholog can be found - print("No candidate region found") - if refBool == True: - continue - else: - taxa = [fdog_ref_species] - reciprocal_sequences = 0 - else: - print(str(number_regions) + " candiate regions were found. Extracting sequences...") - extract_seq(regions, db_path, tmp_path, mode) - - ############### make Augustus PPX search ################################### - - print("starting augustus ppx \n") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("augustus is finished \n") - - ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - taxa = [fdog_ref_species] - else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - - - ################## checking accepted genes for co-orthologs ################ - if reciprocal_sequences == 0: - if regions != 0: - print("No ortholog fulfilled the reciprocity criteria") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - - ################ add sequences to extended.fa in the output folder########## - - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - refBool = True - - ############### make Annotation with FAS ################################### - # if we want to search in only one Taxon - if searchTaxon != '' and fasoff == False: - fas = time.time() - print("Calculating FAS scores") - fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'mkdir ' + tmp_path + 'anno_dir' - starting_subprocess(cmd, 'silent') - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, 'silent') - clean_fas(fasOutFile + "_forward.domains", 'domains') - clean_fas(fasOutFile + "_reverse.domains", 'domains') - clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') - - - #if we searched in more than one Taxon and no ortholog was found + print(" \t ...finished \n") + + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + + ###################### ortholog search ##################################### + + ortholog_sequences = [] + time_ortholog_start = time.time() + if parallel == True: + ##################### parallel compuataion ############################# + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() + for i in results: + ortholog_sequences.append(i) + else: + ###################### computation species per species ################ + for asName in assembly_names: + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] + reciprocal_sequences, candidatesOutFile = ortholog_search(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + ".extended.fa" + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start + if taxa == []: + taxa = [fdog_ref_species] + if append == True: + addSeq(orthologsOutFile, ortholog_sequences) + else: + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + mappingFile = out + "/tmp/" + group + ".mapping.txt" - if refBool == False and searchTaxon == '': - print("No orthologs found. Exciting ...") - cleanup(tmp, tmp_path) - return 1 - #if we searched in more than one taxon - if fasoff == False and searchTaxon == '': + if fasoff == False: fas = time.time() - print("Calculating FAS scores") + print("Calculating FAS scores ...") + tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") ################# remove tmp folder ######################################## - if searchTaxon != '': - cleanup(tmp, tmp_path) - else: - cleanup(tmp, out + "/tmp/") + end = time.time() + time_fas = end - fas + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") + print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + sys.stdout = sys.__stdout__ end = time.time() sys.stdout = sys.__stdout__ #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() + cleanup(tmp, tmp_folder) if __name__ == '__main__': main() diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py deleted file mode 100644 index 1606b1d..0000000 --- a/fdog/mergeAssemblyOutput.py +++ /dev/null @@ -1,124 +0,0 @@ -# -*- coding: utf-8 -*- - -####################################################################### -# Copyright (C) 2020 Vinh Tran -# -# This script is used to merge all output files (.extended.fa, .phyloprofile, -# _forward.domains, _reverse.domains) in a given directory into one file each. -# -# This script is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for -# more details -# -# Contact: hannah.muelbaier@stud.uni-frankfurt.de -# -####################################################################### - -import sys -import os -from os import listdir as ldir -import argparse -from pathlib import Path - -def main(): - version = '0.0.1' - parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.') - parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', - action='store', default='', required=True) - parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) - parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False) - args = parser.parse_args() - - directory = args.input - out = args.output - cleanup = args.cleanup - if not os.path.exists(os.path.abspath(directory)): - sys.exit('%s not found' % directory) - else: - directory = os.path.abspath(directory) - - phyloprofile = None - set_phylo = set() - domains_0 = None - set_domains_f = set() - domains_1 = None - set_domains_r = set() - ex_fasta = None - set_fasta = set() - header_bool = False - for infile in ldir(directory): - if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': - if not phyloprofile: - phyloprofile = open(out + '.phyloprofile', 'w') - phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo: - phyloprofile.write(line) - if len(lines) > 1: - set_phylo = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains': - if not domains_0: - domains_0 = open(out + '_forward.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_f: - domains_0.write(line) - if len(lines) > 1: - set_domains_f = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains': - if not domains_1: - domains_1 = open(out + '_reverse.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_r: - domains_1.write(line) - if len(lines) > 1: - set_domains_r = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa': - if not ex_fasta: - ex_fasta = open(out + '.extended.fa', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - header = set() - #print(set_fasta) - for line in lines: - if line[0] == ">": - header.add(line) - if line not in set_fasta: - ex_fasta.write(line) - header_bool = True - else: - header_bool = False - else: - if header_bool == True: - ex_fasta.write(line) - set_fasta = header - if cleanup == True: - os.remove(directory + '/' +infile) - elif infile.endswith('.tsv'): - os.remove(directory + '/' + infile) - - if phyloprofile: - phyloprofile.close() - if domains_0: - domains_0.close() - if domains_1: - domains_1.close() - if ex_fasta: - ex_fasta.close() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index 2628280..a6c13c2 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -20,14 +20,15 @@ import os from os import listdir as ldir import argparse -from pathlib import Path + def main(): version = '0.0.1' parser = argparse.ArgumentParser(description='You are running fdog.mergeOutput version ' + str(version) + '.') - parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', + parser.add_argument('-i', '--input', + help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) - parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) + parser.add_argument('-o', '--output', help='Output name', action='store', default='', required=True) args = parser.parse_args() directory = args.input diff --git a/fdog/removefDog.py b/fdog/removefDog.py index 0ea27eb..7b705ea 100644 --- a/fdog/removefDog.py +++ b/fdog/removefDog.py @@ -19,9 +19,9 @@ import os import argparse import subprocess -from pathlib import Path import shutil + def query_yes_no(question, default='yes'): valid = {'yes': True, 'y': True, 'ye': True, 'no': False, 'n': False} @@ -44,6 +44,7 @@ def query_yes_no(question, default='yes'): sys.stdout.write('Please respond with "yes" or "no" ' '(or "y" or "n").\n') + def main(): version = '0.0.1' parser = argparse.ArgumentParser(description='You are running fdog.remove version ' + str(version) + '.') @@ -81,5 +82,6 @@ def main(): print('NOTE: fdog genome data are still available at %s.' % dataPath) + if __name__ == '__main__': main() diff --git a/fdog/runMulti.py b/fdog/runMulti.py index a696495..c19b0ff 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -48,7 +48,7 @@ def prepare(args, step): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args + cpu, hyperthread, checkOff, debug, silent) = args mute = False if step == 'core': @@ -69,9 +69,8 @@ def prepare(args, step): coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, True] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] - return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) + otherArgs = [cpu, hyperthread, checkOff, debug, True] + return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) def getSeedName(seedFile): seqName = seedFile.split('.')[0] @@ -106,10 +105,9 @@ def compileCore(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute]) + coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) if len(coreCompilationJobs) > 0: pool = mp.Pool(cpu) coreOut = [] @@ -131,7 +129,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') if mute == True: print(seed) else: @@ -178,7 +176,7 @@ def joinOutputs(outpath, jobName, seeds, keep, silent): def calcFAS (outpath, extendedFa, weightpath, cpu): print('Starting calculating FAS scores...') start = time.time() - fasCmd = 'fdogFAS -i %s -w %s --cores %s' % (extendedFa, weightpath, cpu) + fasCmd = 'fas.runFdogFas -i %s -w %s --cores %s --redo_anno' % (extendedFa, weightpath, cpu) try: subprocess.call([fasCmd], shell = True) end = time.time() @@ -191,7 +189,7 @@ def calcFAS (outpath, extendedFa, weightpath, cpu): sys.exit('Problem running\n%s' % (fasCmd)) def main(): - version = '0.0.33' + version = '0.0.45' parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -289,17 +287,10 @@ def main(): choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) + optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -368,28 +359,20 @@ def main(): # others cpu = args.cpu hyperthread = args.hyperthread + checkOff = args.checkOff debug = args.debug silentOff = args.silentOff if silentOff == True: silent = False else: silent = True - - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix ### check fas if not fasoff: try: - fasVersion = subprocess.run(['calcFAS --version'], shell = True, capture_output = True, check = True) + fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) except: - sys.exit('Problem with calcFAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!') + sys.exit('Problem with FAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!') ### delete output folder and files if needed if forceComplete: @@ -403,7 +386,10 @@ def main(): outfiles = os.listdir(outpath) for item in outfiles: if item.startswith(jobName): - os.remove(os.path.join(outpath, item)) + try: + os.remove(os.path.join(outpath, item)) + except: + shutil.rmtree(outpath+'/'+item) if item.startswith("runtime"): os.remove(os.path.join(outpath, item)) if os.path.exists(outpath + '/missing.txt'): @@ -465,7 +451,7 @@ def main(): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + cpu, hyperthread, checkOff, debug, silent] ### START Path(outpath).mkdir(parents=True, exist_ok=True) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index a0ded09..c4abb82 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -65,13 +65,13 @@ def getfdogInfo(fdogPath, infoType): exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args # basic command (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) + (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs + cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) # add other I/O options (append, force, noCleanup, group, blast, db) = ioArgs if append == True: @@ -153,36 +153,17 @@ def runSingle(args): if minScore > 0: cmd = cmd + ' -coreFilter=%s -minScore=%s' % (coreFilter, minScore) # add other options - (cpu, hyperthread, debug, silent) = otherArgs + (cpu, hyperthread, checkOff, debug, silent) = otherArgs cmd = cmd + ' -cpu=%s' % cpu if hyperthread == True: cmd = cmd + ' -hyperthread' + if checkOff == True: + cmd = cmd + ' -checkOff' if debug == True: cmd = cmd + ' -debug' if silent == True: cmd = cmd + ' -silent' - # add assembly options - (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs - if assembly == True: - cmd = cmd + ' -assembly' - cmd = cmd + ' -reuseCore' - if not augustusRefSpec == '': - cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec - else: - sys.exit('An augutus reference species is requiered by using the option --assembly') - if not avIntron == '': - cmd = cmd + ' -avIntron=%s' % avIntron - if not lengthExtension == '': - cmd = cmd + ' -lengthExtension=%s' % lengthExtension - if not assemblyFile == '': - cmd = cmd + ' -assemblyFile=%s' % assemblyFile - if not searchTool == '': - cmd = cmd + ' -searchTool=%s' % searchTool - if not matrix == '': - cmd = cmd + ' -scoringmatrix=%s' % matrix - if not dataPath == '': - cmd = cmd + ' -dataPath=%s' % dataPath - #print(cmd) + # print(cmd) if mute == True: cmd = cmd + ' > /dev/null 2>&1' try: @@ -191,7 +172,7 @@ def runSingle(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.33' + version = '0.0.45' parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -209,8 +190,6 @@ def main(): optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) @@ -289,17 +268,10 @@ def main(): choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) + optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -319,7 +291,6 @@ def main(): searchpath = args.searchpath weightpath = args.weightpath pathFile = args.pathFile - assemblypath = args.assemblypath # other I/O arguments append = args.append @@ -367,6 +338,7 @@ def main(): # others cpu = args.cpu hyperthread = args.hyperthread + checkOff = args.checkOff debug = args.debug silentOff = args.silentOff if silentOff == True: @@ -374,15 +346,6 @@ def main(): else: silent = True - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix - ### get fdog and data path dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') @@ -430,30 +393,19 @@ def main(): except: sys.exit('weightpath not found in %s' % pathFile) - if assemblypath == '': - assemblypath = dataPath + '/assembly_dir' - if dataPath == 'config': - try: - assemblypath = cfg['assemblypath'] - except: - sys.exit('assemblypath not found in %s' % pathFile) - if assembly == True: - searchpath = assemblypath - ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) # group arguments basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath] + pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, silent] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] + otherArgs = [cpu, hyperthread, checkOff, debug, silent] ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False]) + runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) if __name__ == '__main__': main() diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh index 2e8ff02..1eaf176 100755 --- a/fdog/setup/install_lib.sh +++ b/fdog/setup/install_lib.sh @@ -154,9 +154,6 @@ perlModules=( List::Util Parallel::ForkManager POSIX - XML::SAX - XML::NamespaceSupport - XML::Parser Getopt::Long IO::Handle IPC::Run diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 1f74552..28eb851 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -114,14 +114,20 @@ echo "Downloading and installing annotation tools/databases:" fasta36="yes" if [ -z "$(which fasta36)" ]; then fasta36="no" - fasta36v="fasta-36.3.8h" + # fasta36v="fasta-36.3.8h" + fasta36v="36.3.8h_04-May-2020" if ! [ -f "bin/aligner/bin/fasta36" ]; then - echo "fasta-36" - wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz" - tar xf $fasta36v.tar.gz - rm "${fasta36v}.tar.gz" - mv $fasta36v/* $CURRENT/bin/aligner/ - rm -rf $fasta36v + echo "fasta36" + # wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz" + # tar xf $fasta36v.tar.gz + # rm "${fasta36v}.tar.gz" + # mv $fasta36v/* $CURRENT/bin/aligner/ + # rm -rf $fasta36v + wget "https://github.com/wrpearson/fasta36/archive/refs/tags/v${fasta36v}.tar.gz" + tar xf "v${fasta36v}.tar.gz" + rm "v${fasta36v}.tar.gz" + mv fasta36-${fasta36v}/* $CURRENT/bin/aligner/ + rm -rf "fasta36-${fasta36v}" cd "$CURRENT/bin/aligner/src" if [ $sys=="Linux" ]; then make -f ../make/Makefile.linux64_sse2 all @@ -162,10 +168,10 @@ if ! [ -f "$CURRENT/taxonomy/nodes" ]; then exit fi -fasPrepare=0 +setupFAS=0 if [ $fas == 1 ]; then cd "$CURRENT/bin" - if [ -z "$(which annoFAS)" ]; then + if [ -z "$(which fas.doAnno)" ]; then echo "FAS" pip install --user greedyFAS if [ -z "$($grepprog \$HOME/.local/bin:\$PATH ~/$bashFile)" ]; then @@ -174,22 +180,22 @@ if [ $fas == 1 ]; then if [ -z "$($grepprog $homedir/.local/bin ~/$rprofile)" ]; then echo "Sys.setenv(PATH = paste(\"$homedir/.local/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile fi - fasPrepare=1 + setupFAS=1 else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi cd $CURRENT source ~/$bashFile - if [ -z "$(which annoFAS)" ]; then + if [ -z "$(which fas.doAnno)" ]; then echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mpip install greedyFAS\e[0m!" echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" exit else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi echo "done!" @@ -346,9 +352,6 @@ perlModules=( List::Util Parallel::ForkManager POSIX - XML::SAX - XML::NamespaceSupport - XML::Parser Getopt::Long IO::Handle IPC::Run @@ -409,9 +412,9 @@ else echo "-------------------------------------" $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$fasPrepare" == 1 ]; then + if [ "$setupFAS" == 1 ]; then echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96mprepareFAS\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" + echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" echo "Then you can test fdog with:" else echo "All tests succeeded, fdog should be ready to run. You can test it with:" diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index 7b4bd08..73b8573 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -163,9 +163,6 @@ perlModules=( List::Util Parallel::ForkManager POSIX - XML::SAX - XML::NamespaceSupport - XML::Parser Getopt::Long IO::Handle IPC::Run @@ -230,28 +227,28 @@ if ! [ -f "$CURRENT/taxonomy/nodes" ]; then fi cd "$CURRENT/bin" -fasPrepare=0 -if [ -z "$(which annoFAS)" ]; then +setupFAS=0 +if [ -z "$(which fas.doAnno)" ]; then echo "FAS" conda install -y -c BIONF fas - if [ -z "$(which annoFAS)" ]; then + if [ -z "$(which fas.doAnno)" ]; then echo -e "\e[31mInstallation of FAS failed! Please try again!\e[0m" exit fi - fasPrepare=1 + setupFAS=1 else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi -if [ -z "$(which annoFAS)" ]; then +if [ -z "$(which fas.doAnno)" ]; then echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mconda install -c BIONF fas\e[0m or \e[91mpip install greedyFAS\e[0m" echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" exit else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi cd $CURRENT @@ -435,9 +432,9 @@ else echo "-------------------------------------" $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$fasPrepare" == 1 ]; then + if [ "$setupFAS" == 1 ]; then echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96msetupFAS\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" + echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" echo "Then you can test fdog with:" else echo "All tests succeeded, fdog should be ready to run. You can test it with:" diff --git a/setup.py b/setup.py index 75573c1..b61e66b 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name="fdog", - version="0.0.33", + version="0.0.45", python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", @@ -43,7 +43,7 @@ 'ete3', 'six', 'PyYAML', - 'greedyFAS>=1.5.0' + 'greedyFAS>=1.11.2' ], entry_points={ 'console_scripts': ["fdog.run = fdog.runSingle:main", From 6d7df01742ec284f9df85a4f38b5ae06a4bb1a89 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 11:34:59 +0200 Subject: [PATCH 132/192] updated help function --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e40701b..4733b4b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -769,7 +769,7 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - filter = args.filter + # if filter == True or filter == 'yes': filter = 'yes' else: From ac2652b3162e8fc6d7af94ed6bb0ccea0b10053d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 11:40:22 +0200 Subject: [PATCH 133/192] updated help function --- fdog/fDOGassembly.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 207b50f..27a36c2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- ####################################################################### - # Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog @@ -555,7 +554,6 @@ def cleanup(tmp, tmp_path): print("tmp folder could not be removed!") break - def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names @@ -719,6 +717,7 @@ def main(): #################### handle user input ##################################### start = time.time() + version = '0.1.2' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') @@ -770,11 +769,11 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - # - if filter == True or filter == 'yes': - filter = 'yes' - else: - filter = 'no' + #filter = args.filter + #if filter == True or filter == 'yes': + #filter = 'yes' + #else: + #filter = 'no' #others average_intron_length = args.avIntron length_extension = args.lengthExtension @@ -972,7 +971,6 @@ def main(): if fasoff == False: fas = time.time() print("Calculating FAS scores ...") - tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group @@ -988,9 +986,6 @@ def main(): print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ - end = time.time() - sys.stdout = sys.__stdout__ - #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() cleanup(tmp, tmp_folder) From 688b21e79318679690e1d88bc0e242c169be4da6 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Tue, 19 Oct 2021 11:52:46 +0200 Subject: [PATCH 134/192] rm filter option --- fdog/fDOGassembly.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 12fcf6f..f7f5e05 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -770,11 +770,11 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - filter = args.filter - if filter == True or filter == 'yes': - filter = 'yes' - else: - filter = 'no' + #filter = args.filter + #if filter == True or filter == 'yes': + #filter = 'yes' + #else: + #filter = 'no' #others average_intron_length = args.avIntron length_extension = args.lengthExtension From 075616852382405d6c922fde0677fdc210ca37fc Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 14:32:47 +0200 Subject: [PATCH 135/192] error handling of ValueError in function get_distance_biopython --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 27a36c2..d216048 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,7 +317,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) - except ValueError: + except get_distance_biopython.ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" From f9d4623faa9817bb3f56672c29cf40df47110bce Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:36:51 +0200 Subject: [PATCH 136/192] test --- fdog/fDOGassembly.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d216048..adc48b2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,6 +317,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) + print(distances) except get_distance_biopython.ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" @@ -658,6 +659,7 @@ def ortholog_search(args): sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) #cleanup(tmp, tmp_folder) #sys.exit() + sys.stdout.flush() return [], candidatesOutFile #else: #print("\t ...finished") @@ -667,6 +669,7 @@ def ortholog_search(args): if regions == 0: #no candidat region are available, no ortholog can be found sys.stdout.write("No candidate region found for species %s!\n" % asName) + sys.stdout.flush() return [], candidatesOutFile else: @@ -685,6 +688,7 @@ def ortholog_search(args): ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") + sys.stdout.flush() return [], candidatesOutFile reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) @@ -692,10 +696,12 @@ def ortholog_search(args): if reciprocal_sequences == 0: if regions != 0: sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + sys.stdout.flush() return [], candidatesOutFile else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + sys.stdout.flush() return reciprocal_sequences, candidatesOutFile class Logger(object): From 134f94d830803c708b989d339201501ecad8ab39 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:46:02 +0200 Subject: [PATCH 137/192] test --- fdog/fDOGassembly.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index adc48b2..d6877e2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,14 +317,16 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) - print(distances) + distance_hit_query = distances[best_hit, candidate_name] + distance_ref_hit = distances[best_hit, ref] + #print(distances) except get_distance_biopython.ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" - distance_hit_query = distances[best_hit, candidate_name] - distance_ref_hit = distances[best_hit, ref] + #distance_hit_query = distances[best_hit, candidate_name] + #distance_ref_hit = distances[best_hit, ref] if distance_ref_hit < distance_hit_query: #accepted From 81af9add957ca8ec3eb0257a8d9d0b2e452ab2e9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:50:21 +0200 Subject: [PATCH 138/192] test --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d6877e2..111baf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -320,7 +320,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] #print(distances) - except get_distance_biopython.ValueError: + except ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" From 1c54841813a862987790ef7940d40dccbc8a9642 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:51:25 +0200 Subject: [PATCH 139/192] test --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 111baf7..4bd9938 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,6 +317,9 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) + print(distances) + print(best_hit) + print(candidate_name) distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] #print(distances) From 8eb12a52ca85a97a1174028ba0c9018a70459dba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:58:14 +0200 Subject: [PATCH 140/192] fixed item not found error in distance function --- fdog/fDOGassembly.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 4bd9938..111baf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,9 +317,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) - print(distances) - print(best_hit) - print(candidate_name) distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] #print(distances) From 326ff4259b578d479f980914d1be0bc95d8290b7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 20 Oct 2021 10:25:11 +0200 Subject: [PATCH 141/192] cleaning up output --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 111baf7..36db8a3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -244,7 +244,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") + pass + #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() def searching_for_db(assembly_path): @@ -321,7 +322,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_ref_hit = distances[best_hit, ref] #print(distances) except ValueError: - print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" From 97750b6f1fd010dc5998a7e1636a0663f7bfdcd8 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Wed, 20 Oct 2021 11:39:36 +0200 Subject: [PATCH 142/192] Fdog goes assembly (#13) --- fdog/fDOGassembly.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f7f5e05..8aeec9b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -2,6 +2,7 @@ ####################################################################### + # Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog @@ -245,7 +246,9 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") + pass + #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") + output.close() def searching_for_db(assembly_path): @@ -318,13 +321,16 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) + distance_hit_query = distances[best_hit, candidate_name] + distance_ref_hit = distances[best_hit, ref] + #print(distances) except ValueError: - print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) - return 0, "NaN", "NaN" + #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) - distance_hit_query = distances[best_hit, candidate_name] - distance_ref_hit = distances[best_hit, ref] + + #distance_hit_query = distances[best_hit, candidate_name] + #distance_ref_hit = distances[best_hit, ref] if distance_ref_hit < distance_hit_query: #accepted @@ -660,6 +666,8 @@ def ortholog_search(args): sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) #cleanup(tmp, tmp_folder) #sys.exit() + sys.stdout.flush() + return [], candidatesOutFile #else: #print("\t ...finished") @@ -669,6 +677,8 @@ def ortholog_search(args): if regions == 0: #no candidat region are available, no ortholog can be found sys.stdout.write("No candidate region found for species %s!\n" % asName) + sys.stdout.flush() + return [], candidatesOutFile else: @@ -687,6 +697,7 @@ def ortholog_search(args): ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") + sys.stdout.flush() return [], candidatesOutFile reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) @@ -694,10 +705,12 @@ def ortholog_search(args): if reciprocal_sequences == 0: if regions != 0: sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + sys.stdout.flush() return [], candidatesOutFile else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + sys.stdout.flush() return reciprocal_sequences, candidatesOutFile class Logger(object): @@ -988,9 +1001,6 @@ def main(): print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ - end = time.time() - sys.stdout = sys.__stdout__ - #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() cleanup(tmp, tmp_folder) From a7f9e19097922f3c69921c4ed17199ae1ba83bc8 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Wed, 20 Oct 2021 12:04:27 +0200 Subject: [PATCH 143/192] bug fix in function checkCoOrthologs --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8aeec9b..9b745db 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -325,6 +325,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_ref_hit = distances[best_hit, ref] #print(distances) except ValueError: + pass #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) From 7b8745b8d1da86606a51d779580a68009927f91c Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Wed, 20 Oct 2021 12:20:45 +0200 Subject: [PATCH 144/192] bug fix --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9b745db..10f7aeb 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -327,6 +327,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates except ValueError: pass #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" From c21a3f5b6ffe29c5beeb21b6a992dea15a4d02f7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:28:00 +0200 Subject: [PATCH 145/192] enabled output during parallel computation --- fdog/fDOGassembly.py | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 36db8a3..760e6d0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -631,6 +631,7 @@ def clean_fas(path, file_type): def ortholog_search(args): (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args + output = [] cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') tmp_path = out + "tmp/" + asName + "/" @@ -639,7 +640,7 @@ def ortholog_search(args): fasOutFile = out + "/" + group #mappingFile = out + "/tmp/" + group + ".mapping.txt" - sys.stdout.write("Searching in species " + asName + "\n") + output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" db_check = searching_for_db(db_path) @@ -659,24 +660,20 @@ def ortholog_search(args): time_tblastn_end = time.time() time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: - sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) - #cleanup(tmp, tmp_folder) - #sys.exit() - sys.stdout.flush() - return [], candidatesOutFile + output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) + return [], candidatesOutFile, output #else: #print("\t ...finished") - print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) + output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: #no candidat region are available, no ortholog can be found - sys.stdout.write("No candidate region found for species %s!\n" % asName) - sys.stdout.flush() - return [], candidatesOutFile + output.append("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile, output else: - print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### @@ -686,26 +683,23 @@ def ortholog_search(args): #print("\t ...finished \n") time_augustus_end = time.time() time_augustus = time_augustus_end - time_augustus_start - print("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") - sys.stdout.flush() - return [], candidatesOutFile + return [], candidatesOutFile, output reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) if reciprocal_sequences == 0: if regions != 0: - sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) - sys.stdout.flush() - return [], candidatesOutFile + output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile, output else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - sys.stdout.flush() - return reciprocal_sequences, candidatesOutFile + return reciprocal_sequences, candidatesOutFile, output class Logger(object): def __init__(self, file): @@ -956,12 +950,14 @@ def main(): pool.close() pool.join() for i in results: - ortholog_sequences.append(i) + print(i[2]) + ortholog_sequences.append(i[0], i[1]) else: ###################### computation species per species ################ for asName in assembly_names: args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] - reciprocal_sequences, candidatesOutFile = ortholog_search(args) + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args) + print(output_ortholog_search) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) ################## preparing output ######################################## From d4374231dd228c97dd42f771f6d9b462faf2eb47 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:30:49 +0200 Subject: [PATCH 146/192] enabled output during parallel computation --- fdog/fDOGassembly.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 08de346..42ddf69 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -664,17 +664,8 @@ def ortholog_search(args): time_tblastn_end = time.time() time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: -<<<<<<< HEAD output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) return [], candidatesOutFile, output -======= - sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) - #cleanup(tmp, tmp_folder) - #sys.exit() - sys.stdout.flush() - - return [], candidatesOutFile ->>>>>>> 0016fa5fd0081814b3d2457b7f6b3d5ac4b987a1 #else: #print("\t ...finished") output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) @@ -682,15 +673,8 @@ def ortholog_search(args): regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: #no candidat region are available, no ortholog can be found -<<<<<<< HEAD output.append("No candidate region found for species %s!\n" % asName) return [], candidatesOutFile, output -======= - sys.stdout.write("No candidate region found for species %s!\n" % asName) - sys.stdout.flush() - - return [], candidatesOutFile ->>>>>>> 0016fa5fd0081814b3d2457b7f6b3d5ac4b987a1 else: output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) From 7a37abc0a5109147779704517eddef55135a10ba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:44:35 +0200 Subject: [PATCH 147/192] bug fix --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 42ddf69..6464384 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -325,7 +325,9 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_ref_hit = distances[best_hit, ref] #print(distances) except ValueError: + pass #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" From 02f004671375ebf02c9bc0a607723f6409a9150f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:56:50 +0200 Subject: [PATCH 148/192] improved output --- fdog/fDOGassembly.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 6464384..56de5f1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -955,15 +955,17 @@ def main(): pool.close() pool.join() for i in results: - print(i[2]) ortholog_sequences.append(i[0], i[1]) + for k in i[2]: + print(k) else: ###################### computation species per species ################ for asName in assembly_names: args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args) - print(output_ortholog_search) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + for k in output_ortholog_search: + print(k) ################## preparing output ######################################## orthologsOutFile = out + "/" + group + ".extended.fa" From 52feba3fdc5d50a9d2f14953297fad5381091531 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 12:09:58 +0200 Subject: [PATCH 149/192] improved output --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 56de5f1..ad10cc8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -955,7 +955,7 @@ def main(): pool.close() pool.join() for i in results: - ortholog_sequences.append(i[0], i[1]) + ortholog_sequences.append([i[0], i[1]]) for k in i[2]: print(k) else: From 9c228b2865d1682f2040250f5f4107f11c8d11c4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 25 Oct 2021 13:23:43 +0200 Subject: [PATCH 150/192] a file can be used as input for --searchTaxa --- fdog/fDOGassembly.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ad10cc8..dbd49e0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -880,12 +880,24 @@ def main(): if searchTaxa == []: assembly_names = os.listdir(assemblyDir) else: - assembly_names = os.listdir(assemblyDir) - for Taxon in searchTaxa: - if Taxon not in assembly_names: - print("Taxon %s is not in the assembly_dir" % Taxon) - sys.exit() - assembly_names = searchTaxa + if len(searchTaxa) > 1: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa + else: + if searchTaxa[0] in assembly_names: + assembly_names = searchTaxa + elif os.path.isfile(searchTaxa[0]): + with open(searchTaxa[0]) as file: + lines = file.readlines() + assembly_names = [line.rstrip() for line in lines] + else: + print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0]) + + ################################# paths #################################### From fdb30730476e611d74e0ed8d527ef8711821a7d9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 25 Oct 2021 13:39:19 +0200 Subject: [PATCH 151/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index dee4ab4..fc510c4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -889,7 +889,7 @@ def main(): sys.exit() assembly_names = searchTaxa else: - if searchTaxa[0] in assembly_names: + if searchTaxa[0] in os.listdir(assemblyDir): assembly_names = searchTaxa elif os.path.isfile(searchTaxa[0]): with open(searchTaxa[0]) as file: From f43820e9fc66ec930e89e50ffeba679d5b9f43cd Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 16:55:41 +0200 Subject: [PATCH 152/192] fixed bug in searching_for_db --- fdog/fDOGassembly.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index fc510c4..1b44ea9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -257,6 +257,11 @@ def searching_for_db(assembly_path): check = True for end in db_endings: check = check and os.path.exists(assembly_path + end) + + if check == False: + check = True + for end in db_endings: + check = check and os.path.exists(assembly_path + '.00.' + end) return check def get_distance_biopython(file, matrix): @@ -563,7 +568,6 @@ def cleanup(tmp, tmp_path): print("tmp folder could not be removed!") break - def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names From 7d12ffa28c25f2115ad6005e9d4bf7071508023c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 17:10:06 +0200 Subject: [PATCH 153/192] fixed bug in function searching_for_db --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1b44ea9..fdb90fa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -261,7 +261,7 @@ def searching_for_db(assembly_path): if check == False: check = True for end in db_endings: - check = check and os.path.exists(assembly_path + '.00.' + end) + check = check and os.path.exists(assembly_path + '.00' + end) return check def get_distance_biopython(file, matrix): From 110073f4e00da8795dd43cbec28ac12b9d90b4f4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 17:22:32 +0200 Subject: [PATCH 154/192] bug fix searching_for_db function --- fdog/fDOGassembly.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index fdb90fa..2b9e6fb 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -256,12 +256,8 @@ def searching_for_db(assembly_path): db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] check = True for end in db_endings: - check = check and os.path.exists(assembly_path + end) - - if check == False: - check = True - for end in db_endings: - check = check and os.path.exists(assembly_path + '.00' + end) + if not any(File.endswith(end) for File in os.listdir(assembly_path)): + check = False return check def get_distance_biopython(file, matrix): From afd28c60bf071f9d2943b6ebcb18ee2c4dcd0c09 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 17:28:05 +0200 Subject: [PATCH 155/192] testing --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2b9e6fb..b92cefc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -649,7 +649,8 @@ def ortholog_search(args): output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - db_check = searching_for_db(db_path) + blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + db_check = searching_for_db(blast_dir_path) if db_check == 0: #print("Creating a blast data base...") From 6076c5da9bf5f4abfca1a724dc142b763c46e674 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 17:29:55 +0200 Subject: [PATCH 156/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b92cefc..d220039 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -649,7 +649,7 @@ def ortholog_search(args): output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" db_check = searching_for_db(blast_dir_path) if db_check == 0: From 2f38455330bef1e8b63a2d8ab0c1aed375c7c479 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 9 Feb 2022 14:38:49 +0100 Subject: [PATCH 157/192] reorganization of code to enable the use of metaeuk as an alternative to Augustus --- fdog/.DS_Store | Bin 8196 -> 8196 bytes fdog/fDOGassembly.py | 145 ++++++++++++++++++++++++++----------------- 2 files changed, 89 insertions(+), 56 deletions(-) diff --git a/fdog/.DS_Store b/fdog/.DS_Store index 34e42555d35fd3e0f289e49c57c3fa62ffc1f870..a99a01c231b8aab3b888fe9e4dacf4b66808b3f0 100644 GIT binary patch delta 40 wcmZp1XmOa}&nU7nU^hRb$YvgaaOTbHg(FxdHu!92m-xoA*;8~M)5Hc(01vhes{jB1 delta 69 zcmZp1XmOa}&nUVvU^hRb=w=>)aAs*ShFpeJh9ZV^AnC|Z41}pbktBv3hRVr#!U{~x V&YMGo(^)pNOMGM5yitUm8312<5m*2K diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d220039..8a9af97 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- ####################################################################### - - # Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog @@ -635,8 +633,8 @@ def clean_fas(path, file_type): file.write(new_line) file.close() -def ortholog_search(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args +def ortholog_search_tblastn(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction) = args output = [] cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') @@ -670,8 +668,6 @@ def ortholog_search(args): output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) return [], candidatesOutFile, output - #else: - #print("\t ...finished") output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) @@ -684,14 +680,18 @@ def ortholog_search(args): output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) - ############### make Augustus PPX search ################################### - #print("Starting augustus ppx ...") - time_augustus_start = time.time() - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - #print("\t ...finished \n") - time_augustus_end = time.time() - time_augustus = time_augustus_end - time_augustus_start - output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + + if gene_prediction == "augustus": + ############### make Augustus PPX search ################################### + #print("Starting augustus ppx ...") + time_augustus_start = time.time() + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + #print("\t ...finished \n") + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + else: + print("test") ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: @@ -709,6 +709,48 @@ def ortholog_search(args): return reciprocal_sequences, candidatesOutFile, output +def blockProfiles(core_path, group, mode): + + ######################## paths ################################ + msa_path = core_path + "/" + group +"/"+ group + ".aln" + check_path(msa_path) + profile_path = out + "/tmp/" + group + ".prfl" + + ######################## block profile ##################################### + + print("Building a block profile ...") + cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + + if int(os.path.getsize(profile_path)) > 0: + print("\t ...finished \n") + else: + print("Building block profiles failed. Using prepareAlign to convert alignment\n") + new_path = core_path + group +"/"+ group + "_new.aln" + cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path + starting_subprocess(cmd, mode) + cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + print(" \t ...finished \n") + + return profile_path + +def consensusSequence(core_path, group, mode): + + ######################## paths ################################ + hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + check_path(hmm_path) + consensus_path = out + "/tmp/" + group + ".con" + + ######################## consensus sequence ################################ + #make a majority-rule consensus sequence with the tool hmmemit from hmmer + print("Building a consensus sequence") + cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path + starting_subprocess(cmd, mode) + print("\t ...finished\n") + + return consensus_path + class Logger(object): def __init__(self, file): self.file = file @@ -722,7 +764,6 @@ def write(self, message): def flush(self): pass - def main(): #################### handle user input ##################################### @@ -736,7 +777,6 @@ def main(): required = parser.add_argument_group('Required arguments') required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) - required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') @@ -763,11 +803,12 @@ def main(): optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) + optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False) + optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') args = parser.parse_args() # required group = args.gene - augustus_ref_species = args.augustusRefSpec fdog_ref_species = args.refSpec #paths user input assemblyDir = args.assemblyPath @@ -800,6 +841,18 @@ def main(): append = args.append parallel = args.parallel + #gene prediction tool + augustus = args.augustus + if augutus == True: + augustus_ref_species = args.augustusRefSpec + if augustus_ref_species == '': + print("Augustus reference species is required when using Augustus as gene prediction tool") + return 1 + gene_prediction = "augustus" + else: + gene_prediction = "metaeuk" + + # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") @@ -903,14 +956,8 @@ def main(): ################################# paths #################################### - msa_path = core_path + "/" + group +"/"+ group + ".aln" - check_path(msa_path) - hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" - check_path(hmm_path) fasta_path = core_path + "/" + group +"/"+ group + ".fa" check_path(fasta_path) - consensus_path = out + "/tmp/" + group + ".con" - profile_path = out + "/tmp/" + group + ".prfl" tmp_folder = out + "/tmp" ########### is/are fDOG reference species part of ortholog group? ########## @@ -925,47 +972,30 @@ def main(): print("Gene: " + group) print("fDOG reference species: " + fdog_ref_species + " \n") - ######################## consensus sequence ################################ - group_computation_time_start = time.time() - #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence") - cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path - starting_subprocess(cmd, mode) - print("\t ...finished\n") + ###################### preparations ######################################## - ######################## block profile ##################################### + if augustus == True: + group_computation_time_start = time.time() + consensus_path = consensusSequence(core_path, group, mode) + profile_path = blockProfiles(core_path, group, mode) + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start - print("Building a block profile ...") - cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - starting_subprocess(cmd, 'silent') - - if int(os.path.getsize(profile_path)) > 0: - print("\t ...finished \n") - else: - print("Building block profiles failed. Using prepareAlign to convert alignment\n") - new_path = core_path + group +"/"+ group + "_new.aln" - cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path - starting_subprocess(cmd, mode) - cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - starting_subprocess(cmd, 'silent') - print(" \t ...finished \n") - - group_computation_time_end = time.time() - time_group = group_computation_time_end - group_computation_time_start ###################### ortholog search ##################################### ortholog_sequences = [] time_ortholog_start = time.time() + if parallel == True: - ##################### parallel compuataion ############################# + ##################### parallel computation ############################# calls = [] cpus = mp.cpu_count() pool = mp.Pool(cpus) for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction]) - results = (pool.imap_unordered(ortholog_search, calls)) + results = (pool.imap_unordered(ortholog_search_tblastn, calls)) pool.close() pool.join() for i in results: @@ -973,18 +1003,20 @@ def main(): for k in i[2]: print(k) else: - ###################### computation species per species ################ + ###################### computation species wise ################ for asName in assembly_names: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] - reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args) + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction] + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) for k in output_ortholog_search: print(k) - ################## preparing output ######################################## - orthologsOutFile = out + "/" + group + ".extended.fa" time_ortholog_end = time.time() time_ortholog = time_ortholog_end - time_ortholog_start + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + ".extended.fa" + if taxa == []: taxa = [fdog_ref_species] if append == True: @@ -1006,6 +1038,7 @@ def main(): clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') print("\t ...finished \n") + ################# remove tmp folder ######################################## end = time.time() time_fas = end - fas From e088dff0ac04bd6fd5aa27aedf38af1502eda834 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 9 Feb 2022 16:05:30 +0100 Subject: [PATCH 158/192] included metaeuk --- fdog/fDOGassembly.py | 74 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8a9af97..f12e9cc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -212,6 +212,20 @@ def extract_seq(region_dic, path, tmp_path, mode): cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" starting_subprocess(cmd, mode) +def extract_sequence_from_to(name, file, start, end): + out = name + ".fasta" + if start < 0: + start = 0 + with open(out,"w") as f: + for seq_record in SeqIO.parse(file, "fasta"): + f.write(str(seq_record.id) + "\n") + sequence_length = len(seq_record.seq) + if end > sequence_length: + end = sequence_length + f.write(str(seq_record.seq[start:end]) + "\n") + + return out, start, end + def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): output = open(candidatesOutFile, "w") @@ -246,9 +260,43 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug except FileNotFoundError: pass #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") - output.close() +def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, core_group): + output = open(candidatesOutFile, "w") + + for key in regions: + locations = regions[key] + counter = 0 + for i in locations: + #some variables + counter += 1 + start = str(i[0] - length_extension) + end = str(i[1] + length_extension) + name = key + "_" + str(counter) + file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) + #metaeuk call + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" + print(cmd) + starting_subprocess(cmd, mode) + # parsing header and sequences + try: + sequence_file = open(tmp_path + name + ".fas", "r") + lines = sequence_file.readlines() + id = 0 + for line in lines: + if line[0] == ">": + id += 1 + header = ">" + group + "|" + ass_name + "|" + name + "_" + id + output.write(header) + else: + output.write(line) + sequence_file.close() + except FileNotFoundError: + pass + + output.close() + def searching_for_db(assembly_path): db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] @@ -473,8 +521,6 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print("No ortholog was found with option --strict") return 0, seed - - #print(orthologs) orthologs = set(orthologs) return list(orthologs), seed @@ -651,14 +697,11 @@ def ortholog_search_tblastn(args): db_check = searching_for_db(blast_dir_path) if db_check == 0: - #print("Creating a blast data base...") cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path starting_subprocess(cmd, mode) - #print("\t ...finished \n") #makes a tBLASTn search against database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - #print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' time_tblastn_start = time.time() exit_code = starting_subprocess(cmd, mode, 3600) @@ -683,15 +726,17 @@ def ortholog_search_tblastn(args): if gene_prediction == "augustus": ############### make Augustus PPX search ################################### - #print("Starting augustus ppx ...") time_augustus_start = time.time() augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - #print("\t ...finished \n") time_augustus_end = time.time() time_augustus = time_augustus_end - time_augustus_start output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) else: - print("test") + time_metaeuk_start = time.time() + metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) + time_metaeuk_end = time.time() + time_metaeuk = time_metaeuk_end - time_metaeuk_start + output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))") ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: @@ -820,11 +865,6 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - #filter = args.filter - #if filter == True or filter == 'yes': - #filter = 'yes' - #else: - #filter = 'no' #others average_intron_length = args.avIntron length_extension = args.lengthExtension @@ -852,7 +892,6 @@ def main(): else: gene_prediction = "metaeuk" - # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") @@ -952,8 +991,6 @@ def main(): else: print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0]) - - ################################# paths #################################### fasta_path = core_path + "/" + group +"/"+ group + ".fa" @@ -980,6 +1017,9 @@ def main(): profile_path = blockProfiles(core_path, group, mode) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start + else: + print("test") + #concatinade core_group sequences if metaeuk should be run without tblastn ###################### ortholog search ##################################### From 5cb0f2bba80f33ae3b35861b7891a14ff6ae34ce Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:26:37 +0100 Subject: [PATCH 159/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f12e9cc..f891b47 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -736,7 +736,7 @@ def ortholog_search_tblastn(args): metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) time_metaeuk_end = time.time() time_metaeuk = time_metaeuk_end - time_metaeuk_start - output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))") + output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: From cb085c71af0bda7eb2f7907f0c6a01fa4719f00d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:31:14 +0100 Subject: [PATCH 160/192] bug fix --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f891b47..64192b1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -814,7 +814,7 @@ def main(): #################### handle user input ##################################### start = time.time() - version = '0.1.2' + version = '0.1.3' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) @@ -883,7 +883,7 @@ def main(): #gene prediction tool augustus = args.augustus - if augutus == True: + if augustus == True: augustus_ref_species = args.augustusRefSpec if augustus_ref_species == '': print("Augustus reference species is required when using Augustus as gene prediction tool") From 0d2d26db84d471960cf9e61e18d7721befce253c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:36:11 +0100 Subject: [PATCH 161/192] bug fix --- fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 64192b1..f68c3aa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -754,7 +754,7 @@ def ortholog_search_tblastn(args): return reciprocal_sequences, candidatesOutFile, output -def blockProfiles(core_path, group, mode): +def blockProfiles(core_path, group, mode, out): ######################## paths ################################ msa_path = core_path + "/" + group +"/"+ group + ".aln" @@ -780,7 +780,7 @@ def blockProfiles(core_path, group, mode): return profile_path -def consensusSequence(core_path, group, mode): +def consensusSequence(core_path, group, mode, out): ######################## paths ################################ hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" @@ -1013,8 +1013,8 @@ def main(): if augustus == True: group_computation_time_start = time.time() - consensus_path = consensusSequence(core_path, group, mode) - profile_path = blockProfiles(core_path, group, mode) + consensus_path = consensusSequence(core_path, group, mode, out) + profile_path = blockProfiles(core_path, group, mode, out) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start else: From 8d9ce6015e2b3a395d546b1f0033e918f0e3e1d2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:43:01 +0100 Subject: [PATCH 162/192] added preparation steps for metaeuk (tblastn search preparation) --- fdog/fDOGassembly.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f68c3aa..aa037e8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1019,7 +1019,11 @@ def main(): time_group = group_computation_time_end - group_computation_time_start else: print("test") + group_computation_time_start = time.time() + consensus_path = consensusSequence(core_path, group, mode, out) #concatinade core_group sequences if metaeuk should be run without tblastn + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start ###################### ortholog search ##################################### From 65c8835fd080a227dc19f0f51dad39668e114130 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:45:28 +0100 Subject: [PATCH 163/192] bug fix --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index aa037e8..c82e8fb 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -880,11 +880,12 @@ def main(): force = args.force append = args.append parallel = args.parallel + augustus_ref_species = args.augustusRefSpec #gene prediction tool augustus = args.augustus if augustus == True: - augustus_ref_species = args.augustusRefSpec + if augustus_ref_species == '': print("Augustus reference species is required when using Augustus as gene prediction tool") return 1 From 83275925f7e71b0d8b0609b79b89216a46b3084d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:47:26 +0100 Subject: [PATCH 164/192] bug fix --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c82e8fb..8bbfeba 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1020,6 +1020,7 @@ def main(): time_group = group_computation_time_end - group_computation_time_start else: print("test") + profile_path = "" group_computation_time_start = time.time() consensus_path = consensusSequence(core_path, group, mode, out) #concatinade core_group sequences if metaeuk should be run without tblastn From fb62700935cb87d4d03b32ca0ecc36346ee02037 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:48:47 +0100 Subject: [PATCH 165/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8bbfeba..11a8504 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -733,7 +733,7 @@ def ortholog_search_tblastn(args): output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) else: time_metaeuk_start = time.time() - metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) + metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) time_metaeuk_end = time.time() time_metaeuk = time_metaeuk_end - time_metaeuk_start output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) From f5e25dbc5fb65596bd65312a3e6d6feb83529653 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:51:55 +0100 Subject: [PATCH 166/192] bug fix --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 11a8504..54294d4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -214,15 +214,15 @@ def extract_seq(region_dic, path, tmp_path, mode): def extract_sequence_from_to(name, file, start, end): out = name + ".fasta" - if start < 0: + if int(start) < 0: start = 0 with open(out,"w") as f: for seq_record in SeqIO.parse(file, "fasta"): f.write(str(seq_record.id) + "\n") sequence_length = len(seq_record.seq) - if end > sequence_length: + if int(end) > sequence_length: end = sequence_length - f.write(str(seq_record.seq[start:end]) + "\n") + f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From e59ae539a7e4a679058c1d2535aa53809b9ccb5e Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:54:08 +0100 Subject: [PATCH 167/192] bug fix --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 54294d4..990bbd0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -213,6 +213,7 @@ def extract_seq(region_dic, path, tmp_path, mode): starting_subprocess(cmd, mode) def extract_sequence_from_to(name, file, start, end): + print(name) out = name + ".fasta" if int(start) < 0: start = 0 @@ -222,6 +223,8 @@ def extract_sequence_from_to(name, file, start, end): sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length + print(start) + print(end) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From 188ae4b8a54866978b984335042e74b0d0b9ecc3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:58:05 +0100 Subject: [PATCH 168/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 990bbd0..233d8f5 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -219,7 +219,7 @@ def extract_sequence_from_to(name, file, start, end): start = 0 with open(out,"w") as f: for seq_record in SeqIO.parse(file, "fasta"): - f.write(str(seq_record.id) + "\n") + f.write(">" + str(seq_record.id) + "\n") sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length From 93e79fea116a8387aa8d5df5b08b7b143ada2078 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 12:03:07 +0100 Subject: [PATCH 169/192] bug fix --- fdog/fDOGassembly.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 233d8f5..11091da 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -213,7 +213,7 @@ def extract_seq(region_dic, path, tmp_path, mode): starting_subprocess(cmd, mode) def extract_sequence_from_to(name, file, start, end): - print(name) + #print(name) out = name + ".fasta" if int(start) < 0: start = 0 @@ -223,8 +223,8 @@ def extract_sequence_from_to(name, file, start, end): sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length - print(start) - print(end) + #print(start) + #print(end) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" - print(cmd) + #print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences try: @@ -290,7 +290,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group for line in lines: if line[0] == ">": id += 1 - header = ">" + group + "|" + ass_name + "|" + name + "_" + id + header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) output.write(header) else: output.write(line) From 90eb408d967041e1d3f1960c8ebfe2745853d1ed Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:04:42 +0100 Subject: [PATCH 170/192] testing other paramteres for metaeuk --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 11091da..25b4a6c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -279,7 +279,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences From ca4133aa4ab7389d8c4827d8ebc6702988609e26 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:20:27 +0100 Subject: [PATCH 171/192] testing new parameters --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 25b4a6c..f35c80c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -286,6 +286,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group try: sequence_file = open(tmp_path + name + ".fas", "r") lines = sequence_file.readlines() + print(lines) id = 0 for line in lines: if line[0] == ">": From 6be72527e89676e3f1a89ffb8db492771d198307 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:30:19 +0100 Subject: [PATCH 172/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f35c80c..a8995fa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -299,7 +299,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group except FileNotFoundError: pass - output.close() + output.close() def searching_for_db(assembly_path): From 926963f369aeebc3bfeb5160574961061da90777 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:44:33 +0100 Subject: [PATCH 173/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a8995fa..0836198 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -291,7 +291,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group for line in lines: if line[0] == ">": id += 1 - header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + "\n" output.write(header) else: output.write(line) From 062eefcc7fc94bba111c1c1e977d2fd8a3f4caec Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:44:38 +0100 Subject: [PATCH 174/192] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0836198..48a6f85 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" - #print(cmd) + print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences try: From 49c080e1b76bb65e89268ba46a52dc86d06e4ffc Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:54:56 +0100 Subject: [PATCH 175/192] testing --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 48a6f85..ebca99e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -223,8 +223,9 @@ def extract_sequence_from_to(name, file, start, end): sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length - #print(start) - #print(end) + #for testing only + start = 0 + end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From fb8e97aff28edb0d827ccef10890a8997e9ec1b0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 14:06:25 +0100 Subject: [PATCH 176/192] testing --- fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ebca99e..d22b281 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end): if int(end) > sequence_length: end = sequence_length #for testing only - start = 0 - end = len(seq_record.seq) + #start = 0 + #end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end @@ -281,13 +281,13 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" - print(cmd) + #print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences try: sequence_file = open(tmp_path + name + ".fas", "r") lines = sequence_file.readlines() - print(lines) + #print(lines) id = 0 for line in lines: if line[0] == ">": From be1b56a32c98610b5f8360fd20f1f777e8875b1f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 14:42:37 +0100 Subject: [PATCH 177/192] metaeuk is incldued and running in fdog_assembly workflow --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d22b281..40c63f8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -280,8 +280,9 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" #print(cmd) + # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) # parsing header and sequences try: @@ -1024,7 +1025,7 @@ def main(): group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start else: - print("test") + #print("test") profile_path = "" group_computation_time_start = time.time() consensus_path = consensusSequence(core_path, group, mode, out) From cb9a5fd6c0e23f6907dd8a056bc2fe1dc2736d96 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 15:07:16 +0100 Subject: [PATCH 178/192] testing other metaeuk parameters --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 40c63f8..edaaffe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) From 79791e8f52c95ea2e2e62d228081225508eca07f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 15:20:37 +0100 Subject: [PATCH 179/192] using complete contigs for metaeuk --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index edaaffe..c837c33 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end): if int(end) > sequence_length: end = sequence_length #for testing only - #start = 0 - #end = len(seq_record.seq) + start = 0 + end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From f6f72f7e0a5b3628045449afc9a350a542e1c339 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 15:34:36 +0100 Subject: [PATCH 180/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c837c33..edaaffe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end): if int(end) > sequence_length: end = sequence_length #for testing only - start = 0 - end = len(seq_record.seq) + #start = 0 + #end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From 61a1ee54036074d2d3079766dae26a1bd1a2b300 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 11 Feb 2022 11:41:27 +0100 Subject: [PATCH 181/192] added parameter for own metaeuk db --- fdog/fDOGassembly.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index edaaffe..20b74e3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -266,7 +266,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() -def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, core_group): +def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db): output = open(candidatesOutFile, "w") for key in regions: @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) @@ -686,7 +686,7 @@ def clean_fas(path, file_type): file.close() def ortholog_search_tblastn(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction) = args + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db) = args output = [] cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') @@ -739,7 +739,11 @@ def ortholog_search_tblastn(args): output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) else: time_metaeuk_start = time.time() - metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) + if metaeuk_db == '': + db = fasta_path + else: + db = metaeuk_db + metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db) time_metaeuk_end = time.time() time_metaeuk = time_metaeuk_end - time_metaeuk_start output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) @@ -856,6 +860,7 @@ def main(): optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False) optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') + optional.add_argument('--metaeukDb', help='path to metaeuk reference database', action='store', default='') args = parser.parse_args() # required @@ -887,6 +892,7 @@ def main(): append = args.append parallel = args.parallel augustus_ref_species = args.augustusRefSpec + metaeuk_db = args.metaeukDb #gene prediction tool augustus = args.augustus @@ -964,6 +970,12 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' check_path(assemblyDir) + if metaeuk_db != '': + if not metaeuk_db.endswith('/'): + metaeuk_db = metaeuk_db + '/' + check_path(metaeuk_db) + + try: f = open(out + "/fdog.log", "a+") except FileNotFoundError: @@ -1045,7 +1057,7 @@ def main(): cpus = mp.cpu_count() pool = mp.Pool(cpus) for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction]) + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]) results = (pool.imap_unordered(ortholog_search_tblastn, calls)) pool.close() @@ -1057,7 +1069,7 @@ def main(): else: ###################### computation species wise ################ for asName in assembly_names: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction] + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db] reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) for k in output_ortholog_search: From 81ec9a562d52b9546fd4c7161e89725b9e23783a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 11 Feb 2022 11:47:31 +0100 Subject: [PATCH 182/192] bugfix --- fdog/fDOGassembly.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 20b74e3..daf8bff 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -971,8 +971,6 @@ def main(): check_path(assemblyDir) if metaeuk_db != '': - if not metaeuk_db.endswith('/'): - metaeuk_db = metaeuk_db + '/' check_path(metaeuk_db) From 17a546a155cf5efa09f7c8e16c888a10a9d65615 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 14 Feb 2022 14:40:39 +0100 Subject: [PATCH 183/192] for debugging function get_distance_biopython --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index daf8bff..4a05627 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -313,6 +313,7 @@ def searching_for_db(assembly_path): return check def get_distance_biopython(file, matrix): + print(file) aln = AlignIO.read(open(file), 'fasta') calculator = DistanceCalculator(matrix) dm = calculator.get_distance(aln) From c260ce4b1fceabf421dbf2c2b459ee2ea92978f7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 14 Feb 2022 15:19:06 +0100 Subject: [PATCH 184/192] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 4a05627..664e429 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -313,7 +313,7 @@ def searching_for_db(assembly_path): return check def get_distance_biopython(file, matrix): - print(file) + #print(file) aln = AlignIO.read(open(file), 'fasta') calculator = DistanceCalculator(matrix) dm = calculator.get_distance(aln) @@ -637,7 +637,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for record in candidates: for name in candidate_names: if name in record.id: - f.write(">" + name + "\n") + f.write(">" + record.id + "\n") f.write(str(record.seq) + "\n") f.close() From 0ec76787dffb4a5aa6b8ab0304992775f382335d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 23 Feb 2022 10:47:17 +0100 Subject: [PATCH 185/192] bug fix, testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 664e429..ec41ec2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -636,7 +636,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for record in candidates: for name in candidate_names: - if name in record.id: + if name == record.id: f.write(">" + record.id + "\n") f.write(str(record.seq) + "\n") f.close() From 76e503819d7376a59a0a71b8fe9a3c548ad6ecf5 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 24 Feb 2022 11:04:24 +0100 Subject: [PATCH 186/192] bug fix --- fdog/fDOGassembly.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ec41ec2..0aead0e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -634,11 +634,14 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.write(str(record.seq) + "\n") break + already_written = [] for record in candidates: for name in candidate_names: if name == record.id: - f.write(">" + record.id + "\n") - f.write(str(record.seq) + "\n") + if name not in already_written: + f.write(">" + record.id + "\n") + f.write(str(record.seq) + "\n") + already_written.append(name) f.close() if msaTool == "muscle": From ad12f0aaa68e331847b1e4379cb62cae56c2f729 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 11:22:44 +0100 Subject: [PATCH 187/192] gff file positions were corrected during fDOG-Assembly run --- fdog/fDOGassembly.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0aead0e..d7a8e37 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -268,6 +268,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db): output = open(candidatesOutFile, "w") + region = open(candidatesOutFile.replace(".candidates.fa", ".regions.txt"), "w") + region.write("Conting/scaffold" + "\t" + "start" + "\t" + "end" + "\n") for key in regions: locations = regions[key] @@ -279,6 +281,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group end = str(i[1] + length_extension) name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) + region.write(file + "\t" + str(start) + "\t" + str(end)) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) @@ -298,6 +301,15 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group else: output.write(line) sequence_file.close() + + gff_file = open(tmp_path + name + ".gff", "r") + lines = gff_file.readlines() + for line in lines: + values = line.split("\t") + values[3] = int(values[3]) + int(start) + values[4] = int(values[4]) + int(start) + gff_file.write("\t".join(values)) + gff_file.close() except FileNotFoundError: pass From 6b15f26c04e30b3516d2b560527498c255474e74 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 13:56:48 +0100 Subject: [PATCH 188/192] bug fix --- fdog/fDOGassembly.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d7a8e37..e8ed0ee 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -303,12 +303,12 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group sequence_file.close() gff_file = open(tmp_path + name + ".gff", "r") - lines = gff_file.readlines() - for line in lines: - values = line.split("\t") - values[3] = int(values[3]) + int(start) - values[4] = int(values[4]) + int(start) - gff_file.write("\t".join(values)) + lines = gff_file.readlines() + for line in lines: + values = line.split("\t") + values[3] = int(values[3]) + int(start) + values[4] = int(values[4]) + int(start) + gff_file.write("\t".join(values)) gff_file.close() except FileNotFoundError: pass From 7d7504f1f76e01a4cd27cad5a371ef3c6cc7bcf4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 14:18:39 +0100 Subject: [PATCH 189/192] bug fix --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e8ed0ee..051f331 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -306,8 +306,8 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group lines = gff_file.readlines() for line in lines: values = line.split("\t") - values[3] = int(values[3]) + int(start) - values[4] = int(values[4]) + int(start) + values[3] = str(int(values[3]) + int(start)) + values[4] = str(int(values[4]) + int(start)) gff_file.write("\t".join(values)) gff_file.close() except FileNotFoundError: From 826d676f3846cfa16a6fbba5cdba0d066e158023 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 14:40:52 +0100 Subject: [PATCH 190/192] bug fix --- fdog/fDOGassembly.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 051f331..3770e9b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -306,8 +306,10 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group lines = gff_file.readlines() for line in lines: values = line.split("\t") - values[3] = str(int(values[3]) + int(start)) - values[4] = str(int(values[4]) + int(start)) + new_start = int(values[3]) + int(start) + values[3] = str(new_start) + new_end = int(values[4]) + int(start) + values[4] = str(new_end) gff_file.write("\t".join(values)) gff_file.close() except FileNotFoundError: From 8a832fc1c67161e9361a94bc29f32d9863e284a0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 15:00:56 +0100 Subject: [PATCH 191/192] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3770e9b..2168b5d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -302,7 +302,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group output.write(line) sequence_file.close() - gff_file = open(tmp_path + name + ".gff", "r") + gff_file = open(tmp_path + name + ".gff", "r+") lines = gff_file.readlines() for line in lines: values = line.split("\t") From 14c852c8ed8b53d5f2007820406084ac72908dea Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 15:34:59 +0100 Subject: [PATCH 192/192] bug fix --- fdog/fDOGassembly.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2168b5d..7027236 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -302,15 +302,18 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group output.write(line) sequence_file.close() - gff_file = open(tmp_path + name + ".gff", "r+") + gff_file = open(tmp_path + name + ".gff", "r") lines = gff_file.readlines() + new_lines = [] for line in lines: values = line.split("\t") - new_start = int(values[3]) + int(start) - values[3] = str(new_start) - new_end = int(values[4]) + int(start) - values[4] = str(new_end) - gff_file.write("\t".join(values)) + values[3] = str(int(values[3]) + int(start)) + values[4] = str(int(values[4]) + int(start)) + new_lines.append("\t".join(values)) + gff_file.close() + gff_file = open(tmp_path + name + ".gff", "w") + for line in new_lines: + gff_file.write(line) gff_file.close() except FileNotFoundError: pass