From 9782bec7cfdb3d20ab631d2d18fcf8732f184f02 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 1 Apr 2021 11:38:04 +0200
Subject: [PATCH 001/192] bug fix runSingle.py

---
 fdog/fDOGassembly.py | 4 +++-
 fdog/runSingle.py    | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b802b26..f207516 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -597,6 +597,9 @@ def main():
     if core_path == '':
         core_path = out + '/core_orthologs/'
 
+    print(assemblyDir)
+    
+
 
     # user input has to be checked here before fDOGassembly continues
 
@@ -725,7 +728,6 @@ def main():
                 return 1
 
     ################## checking accepted genes for co-orthologs ##########################
-        print(reciprocal_sequences)
         reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
 
diff --git a/fdog/runSingle.py b/fdog/runSingle.py
index 34d7fc1..a0ded09 100644
--- a/fdog/runSingle.py
+++ b/fdog/runSingle.py
@@ -437,8 +437,8 @@ def main():
                 assemblypath = cfg['assemblypath']
             except:
                 sys.exit('assemblypath not found in %s' % pathFile)
-        if assembly == True:
-            searchpath = assemblypath
+    if assembly == True:
+        searchpath = assemblypath
 
     ### check input arguments
     seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath])

From e56d87ac8f9b7dc5240ea9e6e090ca303648fdd1 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 10:04:04 +0200
Subject: [PATCH 002/192] cleaning output

---
 fdog/fDOGassembly.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f207516..515ddfe 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -60,7 +60,7 @@ def parse_blast(line, blast_results):
     #print(line)
     line = line.replace("\n", "")
     line_info = line.split("\t")
-    #print(line_info)
+    print(line_info)
     evalue = float(line_info[3])
 
     #cut off
@@ -598,7 +598,7 @@ def main():
         core_path = out + '/core_orthologs/'
 
     print(assemblyDir)
-    
+
 
 
     # user input has to be checked here before fDOGassembly continues
@@ -620,7 +620,7 @@ def main():
 
     ###################### create tmp folder ###################################
 
-    os.system('mkdir ' + out + '/tmp')
+    os.system('mkdir ' + out + '/tmp' + '>/dev/null 2>&1')
 
     ######################## consensus sequence ################################
 
@@ -659,7 +659,7 @@ def main():
             searchBool = True
 
         ################### path definitions ###################################
-        os.system('mkdir ' + out + '/tmp/' + asName)
+        os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1')
         tmp_path = out + "/tmp/" + asName + "/"
         candidatesOutFile = tmp_path + group + ".candidates.fa"
         if searchTaxon != '':
@@ -740,7 +740,7 @@ def main():
         if searchTaxon != '' and fasoff == False:
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-            os.system('mkdir ' + tmp_path + 'anno_dir')
+            os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1')
             os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName )
 
 

From 766c89d34b02723403bd4a03296f30785d6c4feb Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 10:56:26 +0200
Subject: [PATCH 003/192] testing

---
 fdog/fDOGassembly.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 515ddfe..d06e2bc 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -31,6 +31,11 @@ def merge(blast_results, insert_length):
             i = 1
             while i < size_list-1:
 
+                a = locations[j][0]
+                b = locations[i][0]
+                c = locations[j][1]
+                d = locations[j][5]
+                e = locations[i][5]
                 if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])):
                     #merge overlapping regions
                     locations[j][1] = max(locations[j][1], locations[i][1])
@@ -60,7 +65,7 @@ def parse_blast(line, blast_results):
     #print(line)
     line = line.replace("\n", "")
     line_info = line.split("\t")
-    print(line_info)
+    #print(line_info)
     evalue = float(line_info[3])
 
     #cut off
@@ -597,7 +602,7 @@ def main():
     if core_path == '':
         core_path = out + '/core_orthologs/'
 
-    print(assemblyDir)
+    #print(assemblyDir)
 
 
 

From 48e41540d6ba403d974219a54c1563436ac54661 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:00:31 +0200
Subject: [PATCH 004/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d06e2bc..c317d8a 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -27,7 +27,7 @@ def merge(blast_results, insert_length):
 
         j = 0
 
-        while j < size_list-1:
+        while j < size_list-2:
             i = 1
             while i < size_list-1:
 

From 47f45d61f2875f61822e12f310e4b07d5eec20df Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:19:14 +0200
Subject: [PATCH 005/192] testing

---
 fdog/fDOGassembly.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c317d8a..be7edaf 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -599,6 +599,9 @@ def main():
         assemblyDir = dataPath + '/assembly_dir/'
     if out == '':
         out = os.getcwd()
+    else:
+        if not os.path.exists(out + '/group'):
+            os.system('mkdir ' + out + '/group')
     if core_path == '':
         core_path = out + '/core_orthologs/'
 

From fe44e0bf0458909febf5e5c9bec2fecd85c5f7ee Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:21:15 +0200
Subject: [PATCH 006/192] testing

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index be7edaf..98e6480 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -599,6 +599,7 @@ def main():
         assemblyDir = dataPath + '/assembly_dir/'
     if out == '':
         out = os.getcwd()
+        os.system('mkdir ' + out + '/group')
     else:
         if not os.path.exists(out + '/group'):
             os.system('mkdir ' + out + '/group')

From 34e87cac0ca8f4b4c24ef807223c2b7cecaa0dbc Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:28:57 +0200
Subject: [PATCH 007/192] testing

---
 fdog/fDOGassembly.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 98e6480..e4434bc 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -599,10 +599,12 @@ def main():
         assemblyDir = dataPath + '/assembly_dir/'
     if out == '':
         out = os.getcwd()
-        os.system('mkdir ' + out + '/group')
+        os.system('mkdir ' + out + '/' + group)
+        out = out + '/' + group
     else:
-        if not os.path.exists(out + '/group'):
-            os.system('mkdir ' + out + '/group')
+        if not os.path.exists(out + '/' + group):
+            os.system('mkdir ' + out + '/' + group)
+        out = out + '/' + group
     if core_path == '':
         core_path = out + '/core_orthologs/'
 

From 32bce0eb9c9d9e1193ea2a668240fbab0f5be18d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:32:12 +0200
Subject: [PATCH 008/192] testing

---
 fdog/fDOGassembly.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index e4434bc..ae77ac3 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -601,10 +601,6 @@ def main():
         out = os.getcwd()
         os.system('mkdir ' + out + '/' + group)
         out = out + '/' + group
-    else:
-        if not os.path.exists(out + '/' + group):
-            os.system('mkdir ' + out + '/' + group)
-        out = out + '/' + group
     if core_path == '':
         core_path = out + '/core_orthologs/'
 

From a8362e35e18c6f298227cd658b4a33ed5d6b3e8a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:38:23 +0200
Subject: [PATCH 009/192] testing

---
 fdog/fDOGassembly.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index ae77ac3..d476d7a 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -566,18 +566,8 @@ def main():
     #     print(out + "fdog.log \n")
     #     sys.stdout = Logger(out)
 
-    try:
-        f = open(out + "fdog.log", "a+")
-    except FileNotFoundError:
-        f = open(out + "fdog.log", "w")
 
 
-    if silent == True:
-        sys.stderr = f
-        sys.stdout = f
-    else:
-        sys.stdout = Logger(f)
-
 
     #checking paths
     if dataPath == '':
@@ -605,6 +595,17 @@ def main():
         core_path = out + '/core_orthologs/'
 
     #print(assemblyDir)
+    try:
+        f = open(out + "fdog.log", "a+")
+    except FileNotFoundError:
+        f = open(out + "fdog.log", "w")
+
+
+    if silent == True:
+        sys.stderr = f
+        sys.stdout = f
+    else:
+        sys.stdout = Logger(f)
 
 
 

From 0458c252acb2c4077c551dcb3ddf361494617251 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:42:57 +0200
Subject: [PATCH 010/192] testing

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d476d7a..f54a654 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -594,6 +594,7 @@ def main():
     if core_path == '':
         core_path = out + '/core_orthologs/'
 
+    print(out)
     #print(assemblyDir)
     try:
         f = open(out + "fdog.log", "a+")

From 1b07c9017814ece00a64adad8a97aac00e1ec89a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:54:09 +0200
Subject: [PATCH 011/192] testing

---
 fdog/fDOGassembly.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f54a654..d0b8610 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -591,10 +591,12 @@ def main():
         out = os.getcwd()
         os.system('mkdir ' + out + '/' + group)
         out = out + '/' + group
+
     if core_path == '':
         core_path = out + '/core_orthologs/'
 
     print(out)
+    print("test " + group + "\n" )
     #print(assemblyDir)
     try:
         f = open(out + "fdog.log", "a+")

From 2d3f8dda146d10082186dc1dce395c87f0949505 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:56:36 +0200
Subject: [PATCH 012/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d0b8610..a00876c 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -599,9 +599,9 @@ def main():
     print("test " + group + "\n" )
     #print(assemblyDir)
     try:
-        f = open(out + "fdog.log", "a+")
+        f = open(out + "/fdog.log", "a+")
     except FileNotFoundError:
-        f = open(out + "fdog.log", "w")
+        f = open(out + "/fdog.log", "w")
 
 
     if silent == True:

From afec218d459c6cc181dd80a7eebe7e41a74754a9 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 11:59:34 +0200
Subject: [PATCH 013/192] testing

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index a00876c..33def84 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -588,6 +588,7 @@ def main():
     if assemblyDir == '':
         assemblyDir = dataPath + '/assembly_dir/'
     if out == '':
+        print('test out \n')
         out = os.getcwd()
         os.system('mkdir ' + out + '/' + group)
         out = out + '/' + group

From 9983e001ab8704188bf74168faf56b0e74a76def Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 8 Apr 2021 12:05:45 +0200
Subject: [PATCH 014/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 33def84..87749bf 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -530,7 +530,7 @@ def main():
     assemblyDir = args.assemblyPath
     dataPath = args.dataPath
     core_path = args.coregroupPath
-    out = args.out + "/"
+    out = args.out
     pathFile = args.pathFile
     #I/O
     tmp = args.tmp
@@ -591,7 +591,7 @@ def main():
         print('test out \n')
         out = os.getcwd()
         os.system('mkdir ' + out + '/' + group)
-        out = out + '/' + group
+        out = out + '/' + group + '/'
 
     if core_path == '':
         core_path = out + '/core_orthologs/'

From 4cca757f6fec7ffbe309996c5b6a8bc98a48a866 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 9 Apr 2021 13:18:56 +0200
Subject: [PATCH 015/192] bug fix if augustus can't identify a gene at a
 candidate region

---
 fdog/fDOGassembly.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 87749bf..03af975 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -146,16 +146,21 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
             cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
             result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
 
-            sequence_file = open(tmp_path + name + ".aa", "r")
-            lines = sequence_file.readlines()
-            for line in lines:
-                if line[0] == ">":
-                    id = line.replace(">", "")
-                    header = ">" + group + "|" + ass_name + "|" + name + "_" + id
-                    output.write(header)
-                else:
-                    output.write(line)
-            sequence_file.close()
+            try:
+                sequence_file = open(tmp_path + name + ".aa", "r")
+                lines = sequence_file.readlines()
+                for line in lines:
+                    if line[0] == ">":
+                        id = line.replace(">", "")
+                        header = ">" + group + "|" + ass_name + "|" + name + "_" + id
+                        output.write(header)
+                    else:
+                        output.write(line)
+                sequence_file.close()
+            except FileNotFoundError:
+                print("No gene found by ID:" + name +" , continuing with next region")
+
+
 
     output.close()
 

From d9bb72dcd0e1e359417d36edbc69de201aa29da6 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 9 Apr 2021 14:07:44 +0200
Subject: [PATCH 016/192] testing

---
 fdog/fDOGassembly.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 03af975..8aa5f74 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -589,6 +589,8 @@ def main():
                 dataPath = cfg['dataPath']
             except:
                 dataPath = 'config'
+    if core_path == '':
+        core_path = out + '/core_orthologs/'
 
     if assemblyDir == '':
         assemblyDir = dataPath + '/assembly_dir/'
@@ -598,8 +600,7 @@ def main():
         os.system('mkdir ' + out + '/' + group)
         out = out + '/' + group + '/'
 
-    if core_path == '':
-        core_path = out + '/core_orthologs/'
+
 
     print(out)
     print("test " + group + "\n" )
@@ -659,9 +660,11 @@ def main():
     else:
         print("Building block profiles failed. Using prepareAlign to convert alignment\n")
         new_path = core_path + group +"/"+ group + "_new.aln"
+        print(cmd)
         cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
         result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
         cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
+        print(cmd)
         result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
         print("block profile is finished \n")
 

From ddec3f0909fb9695c90569b674084b4826a7aa9c Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 9 Apr 2021 14:13:53 +0200
Subject: [PATCH 017/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 8aa5f74..e309f33 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -659,7 +659,7 @@ def main():
         print("block profile is finished \n")
     else:
         print("Building block profiles failed. Using prepareAlign to convert alignment\n")
-        new_path = core_path + group +"/"+ group + "_new.aln"
+        new_path = core_path + "/" + group +"/"+ group + "_new.aln"
         print(cmd)
         cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
         result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)

From 13aea2d3c2233ba9b32e857275d7e39d6574a2a0 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 9 Apr 2021 14:16:14 +0200
Subject: [PATCH 018/192] bug fix

---
 fdog/fDOGassembly.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index e309f33..1691ac9 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -591,6 +591,9 @@ def main():
                 dataPath = 'config'
     if core_path == '':
         core_path = out + '/core_orthologs/'
+    else:
+        if not core_path.endswith('/'):
+            core_path = core_path + '/'
 
     if assemblyDir == '':
         assemblyDir = dataPath + '/assembly_dir/'
@@ -660,11 +663,11 @@ def main():
     else:
         print("Building block profiles failed. Using prepareAlign to convert alignment\n")
         new_path = core_path + "/" + group +"/"+ group + "_new.aln"
-        print(cmd)
+        #print(cmd)
         cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
         result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
         cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
-        print(cmd)
+        #print(cmd)
         result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
         print("block profile is finished \n")
 

From 89a8843fd1c80f2fddb690f5f0505dbf6f8293ba Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 9 Apr 2021 14:17:13 +0200
Subject: [PATCH 019/192] cleaning up

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 1691ac9..6ba8aa6 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -662,7 +662,7 @@ def main():
         print("block profile is finished \n")
     else:
         print("Building block profiles failed. Using prepareAlign to convert alignment\n")
-        new_path = core_path + "/" + group +"/"+ group + "_new.aln"
+        new_path = core_path + group +"/"+ group + "_new.aln"
         #print(cmd)
         cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
         result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)

From 116acad39a7af8c56941b47a55fa96285ca7a132 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 09:50:21 +0200
Subject: [PATCH 020/192] testing

---
 fdog/fDOGassembly.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 6ba8aa6..27dc85b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -428,6 +428,7 @@ def checkOptions():
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     candidates = readFasta(candidatesFile)
     ref = readFasta(fasta)
+    print(candidate_name)
 
     out = tmp_path + '/checkCoorthologs.fa'
     f = open(out,"w")
@@ -441,8 +442,11 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
             f.write(str(record.seq) +  "\n")
             break
 
+
     for record in candidates:
+        print(record.id + "ID\n")
         for name in candidate_names:
+            print(name + "name\n")
             if name in record.id:
                 f.write(">" + name + "\n")
                 f.write(str(record.seq) + "\n")
@@ -604,9 +608,6 @@ def main():
         out = out + '/' + group + '/'
 
 
-
-    print(out)
-    print("test " + group + "\n" )
     #print(assemblyDir)
     try:
         f = open(out + "/fdog.log", "a+")

From 0078ee440f5e933bb81c6fb4eb12b88e788b05e0 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 10:03:56 +0200
Subject: [PATCH 021/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 27dc85b..3d7a243 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -428,7 +428,7 @@ def checkOptions():
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     candidates = readFasta(candidatesFile)
     ref = readFasta(fasta)
-    print(candidate_name)
+    print(candidate_names)
 
     out = tmp_path + '/checkCoorthologs.fa'
     f = open(out,"w")

From c03e59dab4caf920874263fe4c6bc78ba4b36c25 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 10:12:08 +0200
Subject: [PATCH 022/192] testing

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 3d7a243..f4034b6 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -376,6 +376,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
 
 
     #print(orthologs)
+    orthologs = set(orthologs)
     return list(orthologs), seed
 
 def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):

From 366a4ab858870057f7df27f4bfc2ad99134932eb Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 12:18:15 +0200
Subject: [PATCH 023/192] testing

---
 fdog/fDOGassembly.py | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f4034b6..d751f53 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -22,7 +22,7 @@ def merge(blast_results, insert_length):
         locations = blast_results[key]
         locations = sorted(locations, key = lambda x: int(x[3]))
         #print("test")
-        #print(locations)
+        print(locations)
         size_list = len(locations)
 
         j = 0
@@ -59,23 +59,19 @@ def merge(blast_results, insert_length):
     #print(blast_results)
     return blast_results, number_regions
 
-def parse_blast(line, blast_results):
-    # format blast line:  <contig> <sstart> <send> <evalue> <qstart> <qend> <strand>
-    #fomrat dictionary: {node_name: [(<start>,<end>)]}
-    #print(line)
+def parse_blast(line, blast_results, cutoff):
+    # format blast line:  <contig> <sstart> <send> <evalue> <qstart> <qend>
+    #fomrat dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
     line = line.replace("\n", "")
     line_info = line.split("\t")
-    #print(line_info)
     evalue = float(line_info[3])
-
     #cut off
-    if evalue > 0.00001:
+    if evalue > cutoff:
         return blast_results, evalue
     #add region to dictionary
     else:
         node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5]
         split = node_name.split("|")
-
         # finding out on which strand tBLASTn founded a hit
         if sstart < send:
             strand = "+"
@@ -83,7 +79,6 @@ def parse_blast(line, blast_results):
             sstart = line_info[2]
             send = line_info[1]
             strand = "-"
-
         #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001
         if len(split) > 1:
             node_name = split[1]
@@ -96,7 +91,7 @@ def parse_blast(line, blast_results):
 
     return blast_results, evalue
 
-def candidate_regions(intron_length, evalue, tmp_path):
+def candidate_regions(intron_length, cutoff_evalue, tmp_path):
     ###################### extracting candidate regions ########################
     # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
     blast_file = open(tmp_path + "/blast_results.out", "r")
@@ -109,9 +104,9 @@ def candidate_regions(intron_length, evalue, tmp_path):
         if not line:
             break
         #parsing blast output
-        blast_results, evalue = parse_blast(line, blast_results)
+        blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
         #evalue cut-off
-        if not evalue <= evalue:
+        if not evalue <= cutoff_evalue:
             break
     if blast_results == {}:
         return 0,0
@@ -429,7 +424,6 @@ def checkOptions():
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     candidates = readFasta(candidatesFile)
     ref = readFasta(fasta)
-    print(candidate_names)
 
     out = tmp_path + '/checkCoorthologs.fa'
     f = open(out,"w")
@@ -445,9 +439,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
 
 
     for record in candidates:
-        print(record.id + "ID\n")
         for name in candidate_names:
-            print(name + "name\n")
             if name in record.id:
                 f.write(">" + name + "\n")
                 f.write(str(record.seq) + "\n")
@@ -603,7 +595,7 @@ def main():
     if assemblyDir == '':
         assemblyDir = dataPath + '/assembly_dir/'
     if out == '':
-        print('test out \n')
+        #print('test out \n')
         out = os.getcwd()
         os.system('mkdir ' + out + '/' + group)
         out = out + '/' + group + '/'

From 79f2b67802f76f5a3fbb003efbe9fd39f7db70df Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 13:43:34 +0200
Subject: [PATCH 024/192] bug fix in merge function, regions in minus strand
 were not merged correctly

---
 fdog/fDOGassembly.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d751f53..a3480a3 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -36,27 +36,42 @@ def merge(blast_results, insert_length):
                 c = locations[j][1]
                 d = locations[j][5]
                 e = locations[i][5]
-                if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])):
+                if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #merge overlapping regions
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
-                elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5])):
+                elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+                    #merge overlapping regions
+                    locations[j][1] = max(locations[j][1], locations[i][1])
+                    locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations.pop(i)
+                    size_list -= 1
+                    i -= 1
+                elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #print(j)
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1
                     i -=1
+                elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+                    #print(j)
+                    locations[j][1] = max(locations[j][1], locations[i][1])
+                    locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations.pop(i)
+                    size_list -= 1
+                    i -=1
+
                 i += 1
             j += 1
 
         number_regions += len(locations)
         blast_results[key] = locations
 
-    #print(blast_results)
+    print(blast_results)
     return blast_results, number_regions
 
 def parse_blast(line, blast_results, cutoff):

From 5425cd138dc47933a0f019896e1fe5db337d5ad0 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 14:09:51 +0200
Subject: [PATCH 025/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index a3480a3..9694c6d 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -45,7 +45,7 @@ def merge(blast_results, insert_length):
                     i -= 1
                 elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #merge overlapping regions
-                    locations[j][1] = max(locations[j][1], locations[i][1])
+                    locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1

From 174cc0c834c6ea1c9fb89b553dfed24e89570778 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 14:10:11 +0200
Subject: [PATCH 026/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 9694c6d..be67237 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -59,7 +59,7 @@ def merge(blast_results, insert_length):
                     i -=1
                 elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #print(j)
-                    locations[j][1] = max(locations[j][1], locations[i][1])
+                    locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1

From ccc3e4eb0d0aae6eedae7b61e0ab1761ebcf31a2 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 14:28:53 +0200
Subject: [PATCH 027/192] testing

---
 fdog/fDOGassembly.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index be67237..7ae65c0 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -31,11 +31,6 @@ def merge(blast_results, insert_length):
             i = 1
             while i < size_list-1:
 
-                a = locations[j][0]
-                b = locations[i][0]
-                c = locations[j][1]
-                d = locations[j][5]
-                e = locations[i][5]
                 if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #merge overlapping regions
                     locations[j][1] = max(locations[j][1], locations[i][1])
@@ -79,6 +74,7 @@ def parse_blast(line, blast_results, cutoff):
     #fomrat dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
     line = line.replace("\n", "")
     line_info = line.split("\t")
+    print(line_info)
     evalue = float(line_info[3])
     #cut off
     if evalue > cutoff:
@@ -87,14 +83,14 @@ def parse_blast(line, blast_results, cutoff):
     else:
         node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5]
         split = node_name.split("|")
-        # finding out on which strand tBLASTn founded a hit
+        # finding out on which strand tBLASTn found a hit
         if sstart < send:
             strand = "+"
         else:
             sstart = line_info[2]
             send = line_info[1]
             strand = "-"
-        #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001
+        #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off
         if len(split) > 1:
             node_name = split[1]
         if node_name in blast_results:

From e2cb392d31015b99f49cca38b68f7cfacb28e7a6 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 14:35:26 +0200
Subject: [PATCH 028/192] testing

---
 fdog/fDOGassembly.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 7ae65c0..95fe32b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -81,14 +81,14 @@ def parse_blast(line, blast_results, cutoff):
         return blast_results, evalue
     #add region to dictionary
     else:
-        node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5]
+        node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5])
         split = node_name.split("|")
         # finding out on which strand tBLASTn found a hit
         if sstart < send:
             strand = "+"
         else:
-            sstart = line_info[2]
-            send = line_info[1]
+            sstart = int(line_info[2])
+            send = int(line_info[1])
             strand = "-"
         #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off
         if len(split) > 1:

From 6c9b25828e68d0a5dc79f7ed2dd28fcfb3d42aa4 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 15:07:17 +0200
Subject: [PATCH 029/192] testing

---
 fdog/fDOGassembly.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 95fe32b..f8d6487 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -38,7 +38,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
-                elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+                elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #merge overlapping regions
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
@@ -52,7 +52,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -=1
-                elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+                elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #print(j)
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
@@ -74,7 +74,7 @@ def parse_blast(line, blast_results, cutoff):
     #fomrat dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
     line = line.replace("\n", "")
     line_info = line.split("\t")
-    print(line_info)
+    #print(line_info)
     evalue = float(line_info[3])
     #cut off
     if evalue > cutoff:

From b9c055ead8880df456dd1c5fc154bd79c0051f0b Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 15:21:29 +0200
Subject: [PATCH 030/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f8d6487..996bec6 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -27,7 +27,7 @@ def merge(blast_results, insert_length):
 
         j = 0
 
-        while j < size_list-2:
+        while j < size_list-1:
             i = 1
             while i < size_list-1:
 

From 79df315ba23f40bf8205221880a062d81f48b8ed Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 15:30:42 +0200
Subject: [PATCH 031/192] testing

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 996bec6..f4da667 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -18,6 +18,7 @@ def load_config(config_file):
 
 def merge(blast_results, insert_length):
     number_regions = 0
+    insert_length = int(insert_length)
     for key in blast_results:
         locations = blast_results[key]
         locations = sorted(locations, key = lambda x: int(x[3]))

From 0bc70a06235d836dd3c91ff98e2c16de16473364 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 15:36:56 +0200
Subject: [PATCH 032/192] testing

---
 fdog/fDOGassembly.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f4da667..02ff236 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -27,11 +27,9 @@ def merge(blast_results, insert_length):
         size_list = len(locations)
 
         j = 0
-
         while j < size_list-1:
-            i = 1
+            i = j+1
             while i < size_list-1:
-
                 if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #merge overlapping regions
                     locations[j][1] = max(locations[j][1], locations[i][1])
@@ -46,7 +44,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
-                elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
+                elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #print(j)
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])

From a31d5e9acf2fcac7d1d588af42edbc22e6219bdf Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 15:49:20 +0200
Subject: [PATCH 033/192] testing

---
 fdog/fDOGassembly.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 02ff236..d4e0518 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -27,9 +27,12 @@ def merge(blast_results, insert_length):
         size_list = len(locations)
 
         j = 0
-        while j < size_list-1:
-            i = j+1
+        while j < size_list-2:
+            i = j + 1
             while i < size_list-1:
+                print("Vergleich \n")
+                print(locations[j] + "\n")
+                print(locations[i] + "\n")
                 if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #merge overlapping regions
                     locations[j][1] = max(locations[j][1], locations[i][1])
@@ -37,6 +40,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
+                    print("M+")
                 elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #merge overlapping regions
                     locations[j][0] = min(locations[j][0], locations[i][0])
@@ -44,6 +48,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
+                    print("M-")
                 elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #print(j)
                     locations[j][1] = max(locations[j][1], locations[i][1])
@@ -51,6 +56,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -=1
+                    print("Insert+")
                 elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #print(j)
                     locations[j][0] = min(locations[j][0], locations[i][0])
@@ -58,6 +64,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -=1
+                    print("Insert-")
 
                 i += 1
             j += 1

From 55137f49c3e4ba3986239084cbe002713257a888 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 15:52:54 +0200
Subject: [PATCH 034/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d4e0518..b3d78f9 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -31,8 +31,8 @@ def merge(blast_results, insert_length):
             i = j + 1
             while i < size_list-1:
                 print("Vergleich \n")
-                print(locations[j] + "\n")
-                print(locations[i] + "\n")
+                print(str(locations[j]) + "\n")
+                print(str(locations[i]) + "\n")
                 if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #merge overlapping regions
                     locations[j][1] = max(locations[j][1], locations[i][1])

From ab85180e94e60515963a1190386c0c68ed39e771 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 12 Apr 2021 16:00:17 +0200
Subject: [PATCH 035/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b3d78f9..f1e3771 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -27,9 +27,9 @@ def merge(blast_results, insert_length):
         size_list = len(locations)
 
         j = 0
-        while j < size_list-2:
+        while j < size_list-1:
             i = j + 1
-            while i < size_list-1:
+            while i < size_list:
                 print("Vergleich \n")
                 print(str(locations[j]) + "\n")
                 print(str(locations[i]) + "\n")

From f66f72c5638323cc7d22b6f73bea38ce20f6cf2b Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 13 Apr 2021 09:47:12 +0200
Subject: [PATCH 036/192] clean up

---
 fdog/fDOGassembly.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f1e3771..3b499a1 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -23,16 +23,16 @@ def merge(blast_results, insert_length):
         locations = blast_results[key]
         locations = sorted(locations, key = lambda x: int(x[3]))
         #print("test")
-        print(locations)
+        #print(locations)
         size_list = len(locations)
 
         j = 0
         while j < size_list-1:
             i = j + 1
             while i < size_list:
-                print("Vergleich \n")
-                print(str(locations[j]) + "\n")
-                print(str(locations[i]) + "\n")
+                #print("Vergleich \n")
+                #print(str(locations[j]) + "\n")
+                #print(str(locations[i]) + "\n")
                 if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #merge overlapping regions
                     locations[j][1] = max(locations[j][1], locations[i][1])
@@ -40,7 +40,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
-                    print("M+")
+                    #print("M+")
                 elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #merge overlapping regions
                     locations[j][0] = min(locations[j][0], locations[i][0])
@@ -48,7 +48,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
-                    print("M-")
+                    #print("M-")
                 elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
                     #print(j)
                     locations[j][1] = max(locations[j][1], locations[i][1])
@@ -56,7 +56,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -=1
-                    print("Insert+")
+                    #print("Insert+")
                 elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
                     #print(j)
                     locations[j][0] = min(locations[j][0], locations[i][0])
@@ -64,7 +64,7 @@ def merge(blast_results, insert_length):
                     locations.pop(i)
                     size_list -= 1
                     i -=1
-                    print("Insert-")
+                    #print("Insert-")
 
                 i += 1
             j += 1
@@ -72,7 +72,7 @@ def merge(blast_results, insert_length):
         number_regions += len(locations)
         blast_results[key] = locations
 
-    print(blast_results)
+    #print(blast_results)
     return blast_results, number_regions
 
 def parse_blast(line, blast_results, cutoff):

From f573dc4776fac4a9df2513191bcae389f365a9c1 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 15 Apr 2021 11:33:44 +0200
Subject: [PATCH 037/192] testing

---
 fdog/fDOGassembly.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 3b499a1..c64a244 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -77,7 +77,7 @@ def merge(blast_results, insert_length):
 
 def parse_blast(line, blast_results, cutoff):
     # format blast line:  <contig> <sstart> <send> <evalue> <qstart> <qend>
-    #fomrat dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
+    # format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
     line = line.replace("\n", "")
     line_info = line.split("\t")
     #print(line_info)
@@ -123,7 +123,10 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         #parsing blast output
         blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
         #evalue cut-off
+        print(evalue + " evalue candidate region \n")
+        print(cutoff + " cutoff evalue \n")
         if not evalue <= cutoff_evalue:
+            print("break \n")
             break
     if blast_results == {}:
         return 0,0

From 4dad8869a6ee3d5d013d1bbe4586f161455f19c6 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 15 Apr 2021 11:41:24 +0200
Subject: [PATCH 038/192] testing

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c64a244..126decf 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -131,6 +131,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
     if blast_results == {}:
         return 0,0
     else:
+        print(blast_results)
         candidate_regions, number_regions = merge(blast_results, intron_length)
         #candidate_regions, number_regions = merge_regions(blast_results, cut_off)
         #print(candidate_regions, number_regions)

From ef9c17fda354bc4cd5b7954f6f93d4cadf360aba Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 15 Apr 2021 11:43:08 +0200
Subject: [PATCH 039/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 126decf..c69733a 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -123,8 +123,8 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         #parsing blast output
         blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
         #evalue cut-off
-        print(evalue + " evalue candidate region \n")
-        print(cutoff + " cutoff evalue \n")
+        print(str(evalue) + " evalue candidate region \n")
+        print(str(cutoff) + " cutoff evalue \n")
         if not evalue <= cutoff_evalue:
             print("break \n")
             break

From e5b06e1d279195a08c6e94b79dafec192b0d82f4 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 15 Apr 2021 11:44:49 +0200
Subject: [PATCH 040/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c69733a..1b1c5f7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -124,7 +124,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
         #evalue cut-off
         print(str(evalue) + " evalue candidate region \n")
-        print(str(cutoff) + " cutoff evalue \n")
+        print(str(cutoff_evalue) + " cutoff evalue \n")
         if not evalue <= cutoff_evalue:
             print("break \n")
             break

From 7e0377db68470f2a2cdaefa308f1def70250fcbf Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 15 Apr 2021 12:44:50 +0200
Subject: [PATCH 041/192] bug fix

---
 fdog/fDOGassembly.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 1b1c5f7..0485db0 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -122,12 +122,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
             break
         #parsing blast output
         blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
-        #evalue cut-off
-        print(str(evalue) + " evalue candidate region \n")
-        print(str(cutoff_evalue) + " cutoff evalue \n")
-        if not evalue <= cutoff_evalue:
-            print("break \n")
-            break
+
     if blast_results == {}:
         return 0,0
     else:
@@ -731,7 +726,7 @@ def main():
     #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
 
         print("tBLASTn search against data base")
-        os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -out ' + tmp_path + '/blast_results.out')
+        os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + evalue + ' -out ' + tmp_path + '/blast_results.out')
         print("tBLASTn search is finished")
 
     ################### search for candidate regions and extract seq ###########

From 721cfffea9d3837bb49b1a52c91dc6d362f18474 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 15 Apr 2021 12:49:12 +0200
Subject: [PATCH 042/192] testing new tblastn call

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 0485db0..8f47d98 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -726,7 +726,7 @@ def main():
     #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
 
         print("tBLASTn search against data base")
-        os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + evalue + ' -out ' + tmp_path + '/blast_results.out')
+        os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out')
         print("tBLASTn search is finished")
 
     ################### search for candidate regions and extract seq ###########

From 496bb1f8c1dd1b0b158d36c99b567faeae7e67ca Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 15 Apr 2021 12:55:54 +0200
Subject: [PATCH 043/192] testing

---
 fdog/fDOGassembly.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 8f47d98..b9ee3f4 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -126,9 +126,8 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
     if blast_results == {}:
         return 0,0
     else:
-        print(blast_results)
         candidate_regions, number_regions = merge(blast_results, intron_length)
-        #candidate_regions, number_regions = merge_regions(blast_results, cut_off)
+        print(candidate_regions)
         #print(candidate_regions, number_regions)
         return candidate_regions, number_regions
 

From 2cdc82d53fc5dab4a82ddd9e03fbccc0d003d399 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 16 Apr 2021 10:19:54 +0200
Subject: [PATCH 044/192] testing

---
 fdog/fDOGassembly.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b9ee3f4..5d2f9e9 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -127,7 +127,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         return 0,0
     else:
         candidate_regions, number_regions = merge(blast_results, intron_length)
-        print(candidate_regions)
+        #print(candidate_regions)
         #print(candidate_regions, number_regions)
         return candidate_regions, number_regions
 
@@ -750,7 +750,7 @@ def main():
     ################# backward search to filter for orthologs###################
         reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path)
 
-
+        print(reciprocal_sequences)
         if reciprocal_sequences == 0:
             print("No ortholog fulfilled the reciprocity criteria")
             if searchTaxon == '':
@@ -761,7 +761,7 @@ def main():
 
     ################## checking accepted genes for co-orthologs ##########################
         reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
-
+        print(reciprocal_sequences)
 
 
     ################ add sequences to extended.fa in the output folder##########

From 3898d4ee8869332c76250593c2e2c391ad933c46 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 16 Apr 2021 10:27:00 +0200
Subject: [PATCH 045/192] testing

---
 fdog/fDOGassembly.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 5d2f9e9..842d67f 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -437,6 +437,9 @@ def checkOptions():
     #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!!
 
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
+    if len(candidate_name) == 1:
+        return candidate_name
+
     candidates = readFasta(candidatesFile)
     ref = readFasta(fasta)
 

From e1fec1af78f1f59e43d4c4f1be83cbbfa67b661d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 16 Apr 2021 10:28:40 +0200
Subject: [PATCH 046/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 842d67f..d31af58 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -437,8 +437,8 @@ def checkOptions():
     #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!!
 
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
-    if len(candidate_name) == 1:
-        return candidate_name
+    if len(candidate_names) == 1:
+        return candidate_names
 
     candidates = readFasta(candidatesFile)
     ref = readFasta(fasta)

From 65c1e1e0ae34b9bb948de5e2511cca1cc29f6781 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 18 Apr 2021 19:40:44 +0200
Subject: [PATCH 047/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d31af58..80582bc 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -477,11 +477,11 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
 
     for name in candidate_names:
         distance = distances[ref_id , name]
-        if distance < min_dist:
+        if distance <= min_dist:
             min_dist = distance
             min_name = name
 
-    checked = []
+
 
 
     for name in candidate_names:

From 34b2ee591f76296e48dfe27bdd46d6e3d6e666fd Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 20 Apr 2021 15:47:17 +0200
Subject: [PATCH 048/192] code clean up

---
 fdog/fDOGassembly.py | 120 ++++++++++++-------------------------------
 1 file changed, 33 insertions(+), 87 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 80582bc..44e7607 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -17,6 +17,7 @@ def load_config(config_file):
             print(exc)
 
 def merge(blast_results, insert_length):
+    #merging overlapping and contigous candidate regions
     number_regions = 0
     insert_length = int(insert_length)
     for key in blast_results:
@@ -25,54 +26,44 @@ def merge(blast_results, insert_length):
         #print("test")
         #print(locations)
         size_list = len(locations)
-
         j = 0
         while j < size_list-1:
             i = j + 1
             while i < size_list:
-                #print("Vergleich \n")
-                #print(str(locations[j]) + "\n")
-                #print(str(locations[i]) + "\n")
                 if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
-                    #merge overlapping regions
+                    #merge overlapping regions plus strand
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
-                    #print("M+")
                 elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
-                    #merge overlapping regions
+                    #merge overlapping regions minus strand
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
-                    #print("M-")
                 elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
-                    #print(j)
+                    #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1
                     i -=1
-                    #print("Insert+")
                 elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
-                    #print(j)
+                    #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
                     locations.pop(i)
                     size_list -= 1
                     i -=1
-                    #print("Insert-")
-
                 i += 1
             j += 1
 
         number_regions += len(locations)
         blast_results[key] = locations
 
-    #print(blast_results)
     return blast_results, number_regions
 
 def parse_blast(line, blast_results, cutoff):
@@ -80,7 +71,6 @@ def parse_blast(line, blast_results, cutoff):
     # format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
     line = line.replace("\n", "")
     line_info = line.split("\t")
-    #print(line_info)
     evalue = float(line_info[3])
     #cut off
     if evalue > cutoff:
@@ -127,12 +117,11 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         return 0,0
     else:
         candidate_regions, number_regions = merge(blast_results, intron_length)
-        #print(candidate_regions)
-        #print(candidate_regions, number_regions)
+
         return candidate_regions, number_regions
 
 def extract_seq(region_dic, path, tmp_path):
-    #print(region_dic)
+
     for key in region_dic:
         #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f")
         cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f"
@@ -145,17 +134,18 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
         locations = regions[key]
         counter = 0
         for i in locations:
+            # some variables
             counter += 1
             start = str(i[0] - length_extension)
             end = str(i[1] + length_extension)
             name = key + "_" + str(counter)
-            #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff")
-
+            # augutus call
             cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff"
-            result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True)
+            result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
+            # transfer augustus output to as sequence
             cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
             result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
-
+            # parsing header and sequences
             try:
                 sequence_file = open(tmp_path + name + ".aa", "r")
                 lines = sequence_file.readlines()
@@ -168,20 +158,15 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
                         output.write(line)
                 sequence_file.close()
             except FileNotFoundError:
-                print("No gene found by ID:" + name +" , continuing with next region")
-
-
-
+                print("No gene found in region with ID:" + name + " , continuing with next region")
     output.close()
 
 def searching_for_db(assembly_path):
-    #print("test: " + str(assembly_path) + "\n")
+
     db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto']
     check = True
     for end in db_endings:
-        #print(assembly_path + end + "\n")
         check = check and os.path.exists(assembly_path + end)
-        #print(check)
     return check
 
 def get_distance_biopython(file, matrix):
@@ -240,8 +225,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
         #print("mafft-linsi")
         os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
 
-    #d_ref = get_distance(aln_file, best_hit, ref)
-    #d = get_distance(aln_file, best_hit, candidate_name)
     distances = get_distance_biopython(aln_file, matrix)
 
     distance_hit_query = distances[best_hit, candidate_name]
@@ -390,9 +373,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
     return list(orthologs), seed
 
 def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
-    #print("addSequences")
-    #print(sequenceIds)
-    #print(species_list)
+
     output_file = open(output, "a+")
     if refBool == False:
         seq_records_core = readFasta(core_fasta)
@@ -406,8 +387,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
     seq_records_candidate = readFasta(candidate_fasta)
     seq_records_candidate = list(seq_records_candidate)
     for entry_candidate in seq_records_candidate:
-        #print(entry_candidate.id)
-        #print(sequenceIds)
         if entry_candidate.id in sequenceIds:
             output_file.write(">" + entry_candidate.id + "\n")
             output_file.write(str(entry_candidate.seq) + "\n")
@@ -455,7 +434,6 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
             f.write(str(record.seq) +  "\n")
             break
 
-
     for record in candidates:
         for name in candidate_names:
             if name in record.id:
@@ -465,9 +443,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
 
     if msaTool == "muscle":
         os.system("muscle -quiet -in " + out + " -out " + aln_file)
-        #print("muscle -quiet -in " + output_file + " -out " + aln_file)
     elif msaTool == "mafft-linsi":
-        #print("mafft-linsi")
         os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file)
 
     distances = get_distance_biopython(aln_file, matrix)
@@ -481,9 +457,6 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
             min_dist = distance
             min_name = name
 
-
-
-
     for name in candidate_names:
         if distances[min_name , name] < distances[min_name , ref_id]:
             checked.append(name)
@@ -577,18 +550,6 @@ def main():
     searchTaxon = args.searchTaxon
     silent = args.silent
 
-    ###################### How to handling std output ##########################
-    # if silent == True:
-    #     print(out + "fdog.log \n")
-    #     f = open(out + "fdog.log", "a+")
-    #     sys.stdout = f
-    # else:
-    #     print(out + "fdog.log \n")
-    #     sys.stdout = Logger(out)
-
-
-
-
     #checking paths
     if dataPath == '':
         fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','')
@@ -618,31 +579,25 @@ def main():
         os.system('mkdir ' + out + '/' + group)
         out = out + '/' + group + '/'
 
-
-    #print(assemblyDir)
     try:
         f = open(out + "/fdog.log", "a+")
     except FileNotFoundError:
         f = open(out + "/fdog.log", "w")
 
+    ################## How to handle std output and std error ##################
 
     if silent == True:
         sys.stderr = f
         sys.stdout = f
     else:
         sys.stdout = Logger(f)
-
-
-
     # user input has to be checked here before fDOGassembly continues
-
     assembly_names = os.listdir(assemblyDir)
 
-
-
     ########################## some variables ##################################
 
     refBool = False # checks if sequences of reference species were already part of the extended.fa file
+
     ########### paths ###########
 
     msa_path = core_path + "/" + group +"/"+ group + ".aln"
@@ -658,18 +613,16 @@ def main():
     ######################## consensus sequence ################################
 
     #make a majority-rule consensus sequence with the tool hmmemit from hmmer
-    print("Building a consensus sequence \n")
+    print("Building a consensus sequence for gene " + group + " \n")
     os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path)
     print("consensus sequence is finished\n")
 
     ######################## block profile #####################################
 
-    print("Building a block profile \n")
+    print("Building a block profile for gene " + group + " \n")
     cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
-    #os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path)
     result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
 
-    #print(os.path.getsize(profile_path))
     if int(os.path.getsize(profile_path)) > 0:
         print("block profile is finished \n")
     else:
@@ -683,9 +636,9 @@ def main():
         result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
         print("block profile is finished \n")
 
-
     searchBool = False
 
+    #################### fDOG assembly computation for all species #############
     for asName in assembly_names:
         if searchBool == True:
             break
@@ -694,6 +647,7 @@ def main():
             searchBool = True
 
         ################### path definitions ###################################
+
         os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1')
         tmp_path = out + "/tmp/" + asName + "/"
         candidatesOutFile = tmp_path + group + ".candidates.fa"
@@ -706,16 +660,13 @@ def main():
             fasOutFile = out + "/" + group
             mappingFile = out + "/tmp/" + group + ".mapping.txt"
 
-
         print("Searching in species " + asName + "\n")
         assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
         db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
-    ######################## tBLASTn ###########################################
-
-    #database anlegen
 
+    ######################## tBLASTn ###########################################
+        #checks if data base exists already
         db_check = searching_for_db(db_path)
-        #print(assembly_path)
         if db_check == 0:
             print("creating a blast data base \n")
             os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path)
@@ -723,16 +674,13 @@ def main():
         else:
             print('blast data base exists already, continuing...')
 
-
-    #make a tBLASTn search against the new database
-    #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
-
+        #makes a tBLASTn search against the new database
+        #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
         print("tBLASTn search against data base")
         os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out')
         print("tBLASTn search is finished")
 
     ################### search for candidate regions and extract seq ###########
-
     # parse blast and filter for candiate regions
         regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
 
@@ -740,20 +688,20 @@ def main():
             #no candidat region are available, no ortholog can be found
             print("No candidate region found")
             continue
-
         else:
             print(str(number_regions) + " candiate regions were found. Extracting sequences...")
             extract_seq(regions, db_path, tmp_path)
 
     ############### make Augustus PPX search ###################################
+
         print("starting augustus ppx \n")
         augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path)
         print("augustus is finished \n")
 
     ################# backward search to filter for orthologs###################
+
         reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path)
 
-        print(reciprocal_sequences)
         if reciprocal_sequences == 0:
             print("No ortholog fulfilled the reciprocity criteria")
             if searchTaxon == '':
@@ -762,35 +710,34 @@ def main():
                 cleanup(tmp, tmp_path)
                 return 1
 
-    ################## checking accepted genes for co-orthologs ##########################
-        reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
-        print(reciprocal_sequences)
+    ################## checking accepted genes for co-orthologs ################
 
+        reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
     ################ add sequences to extended.fa in the output folder##########
+
         addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
         refBool = True
 
     ############### make Annotation with FAS ###################################
+        # if we want to search in only one Taxon
         if searchTaxon != '' and fasoff == False:
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
             os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1')
             os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName )
-
-
+    #if we searched in more than one Taxon and no ortholog was found
     if refBool == False and searchTaxon == '':
         print("No orthologs found. Exciting ...")
         cleanup(tmp, tmp_path)
         return 1
-
+    #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '':
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group )
 
-
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)
@@ -799,6 +746,5 @@ def main():
 
     f.close()
 
-
 if __name__ == '__main__':
     main()

From 6546b530bc1209d940d50916667ef3ae213a6595 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Apr 2021 09:42:31 +0200
Subject: [PATCH 049/192] clean up code

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 44e7607..bc8eb54 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -576,7 +576,7 @@ def main():
     if out == '':
         #print('test out \n')
         out = os.getcwd()
-        os.system('mkdir ' + out + '/' + group)
+        os.system('mkdir ' + out + '/' + group + '>/dev/null 2>&1')
         out = out + '/' + group + '/'
 
     try:

From 583536554383b3222ce0a01eee343571d234cbec Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Apr 2021 15:01:48 +0200
Subject: [PATCH 050/192] clean up

---
 .DS_Store            | Bin 6148 -> 6148 bytes
 fdog/fDOGassembly.py |   4 ++--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index fa2521e2436140a5f3689d5732ee4d25d777342f..ec261b8d3b9c0dfca3a952aa505e1a946aaf66ab 100644
GIT binary patch
delta 140
zcmZoMXfc=|#>B!ku~2NHo+2a1#(>?7iv?Ji7&#{MFxfMnnC!#Uz#?8<Z7}%~lQQFs
z$={gd8D~xwV}7X1$56tM$WXzM!cfYP%TU0e2j&+6>2!uF&z$_^q@4UD1_lNJ1_q||
m%`cdrGHzz);O79^wV9FSJM(0I5l0S2pd3&M!{!K)HOv4JL?SK#

delta 118
zcmZoMXfc=|#>B)qu~2NHo+2ar#(>?7jO>$nSnL^3P4;1FV3Vw_HZ(Aqe1Sz-aTWs@
zFfu}D27V|Fqh?PQVSTt+j6;BBGdl-A2hh~bf*jwOC-aLqa)8tT^|5S@5Lv?v0F#at
Ad;kCd

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index bc8eb54..1c2f21a 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -576,7 +576,7 @@ def main():
     if out == '':
         #print('test out \n')
         out = os.getcwd()
-        os.system('mkdir ' + out + '/' + group + '>/dev/null 2>&1')
+        os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
         out = out + '/' + group + '/'
 
     try:
@@ -608,7 +608,7 @@ def main():
 
     ###################### create tmp folder ###################################
 
-    os.system('mkdir ' + out + '/tmp' + '>/dev/null 2>&1')
+    os.system('mkdir ' + out + '/tmp' + ' >/dev/null 2>&1')
 
     ######################## consensus sequence ################################
 

From 421580d7895fb76f32ed79820b9d652516af7bf3 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Apr 2021 15:12:24 +0200
Subject: [PATCH 051/192] clean up

---
 fdog/fDOGassembly.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 1c2f21a..2c57503 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -125,7 +125,7 @@ def extract_seq(region_dic, path, tmp_path):
     for key in region_dic:
         #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f")
         cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f"
-        result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+        result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
 
 def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path):
     output = open(candidatesOutFile, "w")
@@ -457,8 +457,10 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
             min_dist = distance
             min_name = name
 
+    checked = []
+
     for name in candidate_names:
-        if distances[min_name , name] < distances[min_name , ref_id]:
+        if distances[min_name , name] <= distances[min_name , ref_id]:
             checked.append(name)
 
     return checked

From 89dfaf0290ada42714b958057d16e537570b5beb Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Apr 2021 15:20:06 +0200
Subject: [PATCH 052/192] reduce output

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 2c57503..b2d2afa 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -144,7 +144,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
             result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
             # transfer augustus output to as sequence
             cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
-            result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+            result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
             # parsing header and sequences
             try:
                 sequence_file = open(tmp_path + name + ".aa", "r")

From ecf29edbc63829f9ee2cfedd872f2b5f4d857c67 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 22 Apr 2021 11:34:21 +0200
Subject: [PATCH 053/192] clean up code

---
 fdog/fDOGassembly.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b2d2afa..03f998a 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -724,10 +724,12 @@ def main():
     ############### make Annotation with FAS ###################################
         # if we want to search in only one Taxon
         if searchTaxon != '' and fasoff == False:
+            print("Calculating FAS scores")
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
             os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1')
-            os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName )
+            cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
+            result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
     #if we searched in more than one Taxon and no ortholog was found
     if refBool == False and searchTaxon == '':
         print("No orthologs found. Exciting ...")
@@ -735,10 +737,12 @@ def main():
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '':
+        print("Calculating FAS scores")
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-        os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group )
+        cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
+        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
 
     ################# remove tmp folder ########################################
     if searchTaxon != '':

From 55a9e6c2ce2fabc8d2872371e6bbe0dc7599407a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sat, 24 Apr 2021 11:05:32 +0200
Subject: [PATCH 054/192] check augustus

---
 fdog/fDOGassembly.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 03f998a..b028245 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -141,7 +141,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
             name = key + "_" + str(counter)
             # augutus call
             cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff"
-            result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
+            #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
+            result = subprocess.run(cmd, shell=True)
             # transfer augustus output to as sequence
             cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
             result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)

From d2492d036e66777104e1277f2035eebee6960f65 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sat, 24 Apr 2021 11:12:34 +0200
Subject: [PATCH 055/192] testing

---
 fdog/fDOGassembly.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b028245..5e85998 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -125,7 +125,8 @@ def extract_seq(region_dic, path, tmp_path):
     for key in region_dic:
         #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f")
         cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f"
-        result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+        #result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+        result = subprocess.run(cmd, shell=True)
 
 def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path):
     output = open(candidatesOutFile, "w")

From 2c0d152f76f9d1540e273822417f2ef9c224abaa Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 21:20:59 +0200
Subject: [PATCH 056/192] adding option to recognize if co-ortholog or not in
 header of the extended.fa

---
 fdog/fDOGassembly.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 5e85998..bc3a290 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -390,8 +390,12 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
     seq_records_candidate = list(seq_records_candidate)
     for entry_candidate in seq_records_candidate:
         if entry_candidate.id in sequenceIds:
-            output_file.write(">" + entry_candidate.id + "\n")
-            output_file.write(str(entry_candidate.seq) + "\n")
+            if entry_candidate == sequenceIds[0]:
+                output_file.write(">" + entry_candidate.id + "|1" + "\n")
+                output_file.write(str(entry_candidate.seq) + "\n")
+            else:
+                output_file.write(">" + entry_candidate.id + "|0" + "\n")
+                output_file.write(str(entry_candidate.seq) + "\n")
     output_file.close()
     return 0
 
@@ -459,10 +463,12 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
             min_dist = distance
             min_name = name
 
-    checked = []
+    checked = [min_name]
 
     for name in candidate_names:
-        if distances[min_name , name] <= distances[min_name , ref_id]:
+        if name == min_name:
+            pass
+        elif distances[min_name , name] <= distances[min_name , ref_id]:
             checked.append(name)
 
     return checked

From 4b19832344ea880614875e6923f9f793b2202f87 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 21:25:54 +0200
Subject: [PATCH 057/192] testing

---
 fdog/fDOGassembly.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index bc3a290..6d5059f 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -390,7 +390,8 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
     seq_records_candidate = list(seq_records_candidate)
     for entry_candidate in seq_records_candidate:
         if entry_candidate.id in sequenceIds:
-            if entry_candidate == sequenceIds[0]:
+            if entry_candidate.id == sequenceIds[0]:
+                print(entry_candidate.id)
                 output_file.write(">" + entry_candidate.id + "|1" + "\n")
                 output_file.write(str(entry_candidate.seq) + "\n")
             else:
@@ -751,7 +752,7 @@ def main():
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
-
+        print(cmd)
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)

From db4c6a57fff0939bbae951a9c0ae3b1dc3c3384e Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 21:34:36 +0200
Subject: [PATCH 058/192] testing

---
 fdog/fDOGassembly.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 6d5059f..2f780c5 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -383,7 +383,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
         for species in species_list:
             for entry_core in seq_records_core:
                 if species in entry_core.id:
-                    output_file.write(">" + entry_core.id + "\n")
+                    output_file.write(">" + entry_core.id + "|1" + "\n")
                     output_file.write(str(entry_core.seq) + "\n")
 
     seq_records_candidate = readFasta(candidate_fasta)
@@ -403,6 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
 def createFasInput(orthologsOutFile, mappingFile):
     with open(orthologsOutFile, "r") as f:
         fas_seed_id = (f.readline())[1:-1]
+        fas_seed_id = fas_seed_id.split("|")[0]
 
     mappingFile = open(mappingFile, "a+")
 

From f4871452939fa6b9952f1293b46c2aa3b2376464 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 21:54:12 +0200
Subject: [PATCH 059/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 2f780c5..9ea9837 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -752,7 +752,7 @@ def main():
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
-        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
+        result = subprocess.run(cmd, shell=True)
         print(cmd)
     ################# remove tmp folder ########################################
     if searchTaxon != '':

From 43b73b0a63bea0b3b72557ec19fc1fe9b7ed2574 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 22:02:08 +0200
Subject: [PATCH 060/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 9ea9837..c549076 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
 def createFasInput(orthologsOutFile, mappingFile):
     with open(orthologsOutFile, "r") as f:
         fas_seed_id = (f.readline())[1:-1]
-        fas_seed_id = fas_seed_id.split("|")[0]
+        #fas_seed_id = fas_seed_id.split("|")[0]
 
     mappingFile = open(mappingFile, "a+")
 

From 620d5fa9cf37883ccd9e14556af6513e993559d5 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 22:34:04 +0200
Subject: [PATCH 061/192] testing

---
 fdog/fDOGassembly.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index bc3a290..d13cbc8 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -383,14 +383,15 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
         for species in species_list:
             for entry_core in seq_records_core:
                 if species in entry_core.id:
-                    output_file.write(">" + entry_core.id + "\n")
+                    output_file.write(">" + entry_core.id + "|1" + "\n")
                     output_file.write(str(entry_core.seq) + "\n")
 
     seq_records_candidate = readFasta(candidate_fasta)
     seq_records_candidate = list(seq_records_candidate)
     for entry_candidate in seq_records_candidate:
         if entry_candidate.id in sequenceIds:
-            if entry_candidate == sequenceIds[0]:
+            if entry_candidate.id == sequenceIds[0]:
+                print(entry_candidate.id)
                 output_file.write(">" + entry_candidate.id + "|1" + "\n")
                 output_file.write(str(entry_candidate.seq) + "\n")
             else:
@@ -750,8 +751,8 @@ def main():
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
-        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
-
+        result = subprocess.run(cmd, shell=True)
+        print(cmd)
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)

From ac3477362a0e7339dbc6de19460e79cc26d8ce58 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 22:41:07 +0200
Subject: [PATCH 062/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c549076..d20968e 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -383,7 +383,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
         for species in species_list:
             for entry_core in seq_records_core:
                 if species in entry_core.id:
-                    output_file.write(">" + entry_core.id + "|1" + "\n")
+                    output_file.write(">" + entry_core.id + "\n")
                     output_file.write(str(entry_core.seq) + "\n")
 
     seq_records_candidate = readFasta(candidate_fasta)

From 86337fcb7b7884c0865bef1b56bd3f1daf26385a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 22:42:09 +0200
Subject: [PATCH 063/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d20968e..e8100ec 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
 def createFasInput(orthologsOutFile, mappingFile):
     with open(orthologsOutFile, "r") as f:
         fas_seed_id = (f.readline())[1:-1]
-        #fas_seed_id = fas_seed_id.split("|")[0]
+        fas_seed_id = fas_seed_id.split("|")[0]
 
     mappingFile = open(mappingFile, "a+")
 

From 507238052124d6ea6e0c4f45594ff51d741a1614 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 25 Apr 2021 22:47:37 +0200
Subject: [PATCH 064/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index e8100ec..d20968e 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
 def createFasInput(orthologsOutFile, mappingFile):
     with open(orthologsOutFile, "r") as f:
         fas_seed_id = (f.readline())[1:-1]
-        fas_seed_id = fas_seed_id.split("|")[0]
+        #fas_seed_id = fas_seed_id.split("|")[0]
 
     mappingFile = open(mappingFile, "a+")
 

From df6d32467000ee0c350e313e68d118d2bbfcf90d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 10:54:45 +0200
Subject: [PATCH 065/192] added function starting_subprocess() to handle calls
 to external tools more easily

---
 fdog/fDOGassembly.py | 69 ++++++++++++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d20968e..a7c24ed 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -16,6 +16,14 @@ def load_config(config_file):
         except yaml.YAMLError as exc:
             print(exc)
 
+def starting_subprocess(cmd, mode):
+    if mode == 'debug':
+        result = subprocess.run(cmd, shell=True)
+    elif mode == 'silent':
+        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
+    elif mode == 'normal':
+        result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+
 def merge(blast_results, insert_length):
     #merging overlapping and contigous candidate regions
     number_regions = 0
@@ -120,15 +128,14 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
 
         return candidate_regions, number_regions
 
-def extract_seq(region_dic, path, tmp_path):
+def extract_seq(region_dic, path, tmp_path, mode):
 
     for key in region_dic:
         #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f")
         cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f"
-        #result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
-        result = subprocess.run(cmd, shell=True)
+        starting_subprocess(cmd, mode)
 
-def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path):
+def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode):
     output = open(candidatesOutFile, "w")
 
     for key in regions:
@@ -143,10 +150,10 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
             # augutus call
             cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff"
             #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
-            result = subprocess.run(cmd, shell=True)
+            starting_subprocess(cmd, mode)
             # transfer augustus output to as sequence
             cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
-            result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+            starting_subprocess(cmd, mode)
             # parsing header and sequences
             try:
                 sequence_file = open(tmp_path + name + ".aa", "r")
@@ -524,6 +531,8 @@ def main():
     optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
     optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='')
     optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
+    optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
+
 
     args = parser.parse_args()
 
@@ -561,6 +570,18 @@ def main():
     fasoff = args.fasoff
     searchTaxon = args.searchTaxon
     silent = args.silent
+    debug = args.debug
+
+    if debug == True and silent == True:
+        print("It's not possible to use booth modes, please restart and use --debug or --silent")
+        return 1
+    else:
+        if debug == True:
+            mode = 'debug'
+        elif silent == True:
+            mode = 'silent'
+        else:
+            mode = 'normal'
 
     #checking paths
     if dataPath == '':
@@ -598,11 +619,12 @@ def main():
 
     ################## How to handle std output and std error ##################
 
-    if silent == True:
+    if mode == 'silent':
         sys.stderr = f
         sys.stdout = f
     else:
         sys.stdout = Logger(f)
+
     # user input has to be checked here before fDOGassembly continues
     assembly_names = os.listdir(assemblyDir)
 
@@ -620,20 +642,22 @@ def main():
 
     ###################### create tmp folder ###################################
 
-    os.system('mkdir ' + out + '/tmp' + ' >/dev/null 2>&1')
+    cmd = 'mkdir ' + out + '/tmp'
+    starting_subprocess(cmd, 'silent')
 
     ######################## consensus sequence ################################
 
     #make a majority-rule consensus sequence with the tool hmmemit from hmmer
     print("Building a consensus sequence for gene " + group + " \n")
-    os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path)
+    cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
+    starting_subprocess(cmd, mode)
     print("consensus sequence is finished\n")
 
     ######################## block profile #####################################
 
     print("Building a block profile for gene " + group + " \n")
     cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
-    result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+    starting_subprocess(cmd, mode)
 
     if int(os.path.getsize(profile_path)) > 0:
         print("block profile is finished \n")
@@ -642,10 +666,10 @@ def main():
         new_path = core_path + group +"/"+ group + "_new.aln"
         #print(cmd)
         cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
-        result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+        starting_subprocess(cmd, mode)
         cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
         #print(cmd)
-        result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+        starting_subprocess(cmd, mode)
         print("block profile is finished \n")
 
     searchBool = False
@@ -660,7 +684,8 @@ def main():
 
         ################### path definitions ###################################
 
-        os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1')
+        cmd = 'mkdir ' + out + '/tmp/' + asName
+        starting_subprocess(cmd, 'silent')
         tmp_path = out + "/tmp/" + asName + "/"
         candidatesOutFile = tmp_path + group + ".candidates.fa"
         if searchTaxon != '':
@@ -681,7 +706,8 @@ def main():
         db_check = searching_for_db(db_path)
         if db_check == 0:
             print("creating a blast data base \n")
-            os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path)
+            cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
+            starting_subprocess(cmd, mode)
             print("database is finished \n")
         else:
             print('blast data base exists already, continuing...')
@@ -689,7 +715,8 @@ def main():
         #makes a tBLASTn search against the new database
         #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
         print("tBLASTn search against data base")
-        os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out')
+        cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+        starting_subprocess(cmd, mode)
         print("tBLASTn search is finished")
 
     ################### search for candidate regions and extract seq ###########
@@ -702,12 +729,12 @@ def main():
             continue
         else:
             print(str(number_regions) + " candiate regions were found. Extracting sequences...")
-            extract_seq(regions, db_path, tmp_path)
+            extract_seq(regions, db_path, tmp_path, mode)
 
     ############### make Augustus PPX search ###################################
 
         print("starting augustus ppx \n")
-        augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path)
+        augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
         print("augustus is finished \n")
 
     ################# backward search to filter for orthologs###################
@@ -737,9 +764,10 @@ def main():
             print("Calculating FAS scores")
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-            os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1')
+            cmd = 'mkdir ' + tmp_path + 'anno_dir'
+            starting_subprocess(cmd, 'silent')
             cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
-            result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
+            starting_subprocess(cmd, mode)
     #if we searched in more than one Taxon and no ortholog was found
     if refBool == False and searchTaxon == '':
         print("No orthologs found. Exciting ...")
@@ -752,8 +780,7 @@ def main():
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
-        result = subprocess.run(cmd, shell=True)
-        print(cmd)
+        starting_subprocess(cmd, mode)
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)

From 7187972986ee69a27b472104d981455498c208bb Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 10:55:29 +0200
Subject: [PATCH 066/192] added augustus to dependencies

---
 .DS_Store                 | Bin 6148 -> 6148 bytes
 fdog/setup/setup_conda.sh |   3 +++
 2 files changed, 3 insertions(+)

diff --git a/.DS_Store b/.DS_Store
index ec261b8d3b9c0dfca3a952aa505e1a946aaf66ab..824f712743a6414728f27d69a840e656771a9cdf 100644
GIT binary patch
delta 68
zcmZoMXffE}&BAzUvJXoGn`Cvhp@GTd3oOctvlzgDkr6^O@Iz@BHG8uN>qn-|>>Pjj
E0r)Enz5oCK

delta 40
wcmZoMXffE}&BAzMvJXoGi+FXl!Q@LU%8WB6e`AqnoVi(y^&`_}R*wJt02)~h&j0`b

diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh
index fae81b7..ddc4e23 100755
--- a/fdog/setup/setup_conda.sh
+++ b/fdog/setup/setup_conda.sh
@@ -116,6 +116,7 @@ dependencies=(
   mafft # for linsi
   muscle
   fasta36
+  augustus #for fdog.assembly
 )
 
 for i in "${dependencies[@]}"; do
@@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do
       fi
     elif [ "$tool" = "fasta36" ]; then
       conda install -y -c bioconda fasta3
+    elif [ "$tool" = "augustus" ]; then
+      conda install -y -c bioconda augustus
     else
       conda install -y -c bioconda $i
     fi

From 721bcdbaa9c0db7055c9bd3e4c0001cd613ea045 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 12:55:26 +0200
Subject: [PATCH 067/192] testing

---
 fdog/fDOGassembly.py | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index a7c24ed..bdaf93b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -247,7 +247,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
         #rejected
         return 0, distance_ref_hit, distance_hit_query
 
-def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path):
+def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode):
     # the backward search uses the genes predicted from augustus and makes a blastp search
     #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa
     seedDic = getSeedInfo(fasta_path)
@@ -263,7 +263,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
             print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
             return 0, seed
         if searchTool == "blast":
-            os.system("blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile)
+            cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
+            starting_subprocess(cmd, mode)
         else:
             print("diamonds are the girls best friends")
             ##### diamond call
@@ -348,7 +349,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
                 print("The species " + species + " isn't part of the core ortholog group, ... exciting")
                 return 0, seed
 
-            os.system("blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile)
+            cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
+            starting_subprocess(cmd, mode)
             alg_file = open(tmp_path + "/blast_" + species, "r")
             lines = alg_file.readlines()
             alg_file.close()
@@ -393,17 +395,18 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
                     output_file.write(">" + entry_core.id + "\n")
                     output_file.write(str(entry_core.seq) + "\n")
 
-    seq_records_candidate = readFasta(candidate_fasta)
-    seq_records_candidate = list(seq_records_candidate)
-    for entry_candidate in seq_records_candidate:
-        if entry_candidate.id in sequenceIds:
-            if entry_candidate.id == sequenceIds[0]:
-                print(entry_candidate.id)
-                output_file.write(">" + entry_candidate.id + "|1" + "\n")
-                output_file.write(str(entry_candidate.seq) + "\n")
-            else:
-                output_file.write(">" + entry_candidate.id + "|0" + "\n")
-                output_file.write(str(entry_candidate.seq) + "\n")
+    if sequenceIds != 0:
+        seq_records_candidate = readFasta(candidate_fasta)
+        seq_records_candidate = list(seq_records_candidate)
+        for entry_candidate in seq_records_candidate:
+            if entry_candidate.id in sequenceIds:
+                if entry_candidate.id == sequenceIds[0]:
+                    print(entry_candidate.id)
+                    output_file.write(">" + entry_candidate.id + "|1" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
+                else:
+                    output_file.write(">" + entry_candidate.id + "|0" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
     output_file.close()
     return 0
 
@@ -738,16 +741,24 @@ def main():
         print("augustus is finished \n")
 
     ################# backward search to filter for orthologs###################
+        if int(os.path.getsize(candidatesOutFile)) > 0:
+            print("No genes found at candidate regions\n")
+            if searchTaxon == '':
+                continue
+            else:
+                addSequences(0, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
+                return 0
 
-        reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path)
+        reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 
         if reciprocal_sequences == 0:
             print("No ortholog fulfilled the reciprocity criteria")
             if searchTaxon == '':
                 continue
             else:
+                addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
                 cleanup(tmp, tmp_path)
-                return 1
+                return 0
 
     ################## checking accepted genes for co-orthologs ################
 

From 9a2e4d00a97cff812e623b4bf219e581ae08922b Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 13:06:34 +0200
Subject: [PATCH 068/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index bdaf93b..c22e515 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -741,7 +741,7 @@ def main():
         print("augustus is finished \n")
 
     ################# backward search to filter for orthologs###################
-        if int(os.path.getsize(candidatesOutFile)) > 0:
+        if int(os.path.getsize(candidatesOutFile)) <= 0:
             print("No genes found at candidate regions\n")
             if searchTaxon == '':
                 continue

From 1e5893b85c169899ed0ace275dcb3ff89ee5cdef Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 13:30:51 +0200
Subject: [PATCH 069/192] testing

---
 fdog/fDOGassembly.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c22e515..e52b8a4 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -22,7 +22,7 @@ def starting_subprocess(cmd, mode):
     elif mode == 'silent':
         result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
     elif mode == 'normal':
-        result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+        result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True)
 
 def merge(blast_results, insert_length):
     #merging overlapping and contigous candidate regions
@@ -485,6 +485,17 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
 
     return checked
 
+def changes_for_fas(file, header, mode):
+    #def replace_first_line( src_filename, target_filename, replacement_line):
+    f_in = open(file)
+    first_line, remainder = f.readline(), f.read()
+    line = first_line.split("|")[0]
+    f_in.close()
+    f_out = open(file + "s","w")
+    f_out.write(line + "\n")
+    f_out.write(remainder)
+    f_out.close()
+
 class Logger(object):
     def __init__(self, file):
         self.file = file
@@ -746,23 +757,20 @@ def main():
             if searchTaxon == '':
                 continue
             else:
-                addSequences(0, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
-                return 0
+                reciprocal_sequences = 0
+        else:
+            reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 
-        reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 
+    ################## checking accepted genes for co-orthologs ################
         if reciprocal_sequences == 0:
             print("No ortholog fulfilled the reciprocity criteria")
             if searchTaxon == '':
                 continue
             else:
-                addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
-                cleanup(tmp, tmp_path)
-                return 0
-
-    ################## checking accepted genes for co-orthologs ################
-
-        reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+                reciprocal_sequences = 0
+        else:
+            reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
     ################ add sequences to extended.fa in the output folder##########
 

From e8440c86fcec447a0ff1d98ffd3d1940139a69bb Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 13:42:49 +0200
Subject: [PATCH 070/192] testing

---
 fdog/fDOGassembly.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index e52b8a4..688a000 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -754,7 +754,7 @@ def main():
     ################# backward search to filter for orthologs###################
         if int(os.path.getsize(candidatesOutFile)) <= 0:
             print("No genes found at candidate regions\n")
-            if searchTaxon == '':
+            if searchTaxon == '' and refBool == True:
                 continue
             else:
                 reciprocal_sequences = 0
@@ -765,7 +765,7 @@ def main():
     ################## checking accepted genes for co-orthologs ################
         if reciprocal_sequences == 0:
             print("No ortholog fulfilled the reciprocity criteria")
-            if searchTaxon == '':
+            if searchTaxon == '' and refBool == True:
                 continue
             else:
                 reciprocal_sequences = 0
@@ -788,6 +788,7 @@ def main():
             cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
             starting_subprocess(cmd, mode)
     #if we searched in more than one Taxon and no ortholog was found
+
     if refBool == False and searchTaxon == '':
         print("No orthologs found. Exciting ...")
         cleanup(tmp, tmp_path)

From 6362e47e45034fd026cfbc2e3319c3266a2c9d65 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 13:52:41 +0200
Subject: [PATCH 071/192] testing

---
 fdog/fDOGassembly.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 688a000..08cdfaa 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -739,21 +739,23 @@ def main():
 
         if regions == 0:
             #no candidat region are available, no ortholog can be found
-            print("No candidate region found")
-            continue
+            if refBool == True:
+                print("No candidate region found")
+                continue
         else:
             print(str(number_regions) + " candiate regions were found. Extracting sequences...")
             extract_seq(regions, db_path, tmp_path, mode)
 
     ############### make Augustus PPX search ###################################
 
-        print("starting augustus ppx \n")
-        augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-        print("augustus is finished \n")
+            print("starting augustus ppx \n")
+            augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
+            print("augustus is finished \n")
 
     ################# backward search to filter for orthologs###################
-        if int(os.path.getsize(candidatesOutFile)) <= 0:
-            print("No genes found at candidate regions\n")
+        if int(os.path.getsize(candidatesOutFile)) <= 0 or regions == 0:
+            if regions != 0:
+                print("No genes found at candidate regions\n")
             if searchTaxon == '' and refBool == True:
                 continue
             else:

From 02ad76cd791f0c7d202f443ca2a0665a13271c3a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 13:58:03 +0200
Subject: [PATCH 072/192] testing

---
 fdog/fDOGassembly.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 08cdfaa..02f627f 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -753,15 +753,14 @@ def main():
             print("augustus is finished \n")
 
     ################# backward search to filter for orthologs###################
-        if int(os.path.getsize(candidatesOutFile)) <= 0 or regions == 0:
-            if regions != 0:
+            if int(os.path.getsize(candidatesOutFile)) <= 0:
                 print("No genes found at candidate regions\n")
-            if searchTaxon == '' and refBool == True:
-                continue
+                if searchTaxon == '' and refBool == True:
+                    continue
+                else:
+                    reciprocal_sequences = 0
             else:
-                reciprocal_sequences = 0
-        else:
-            reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
+                reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 
 
     ################## checking accepted genes for co-orthologs ################
@@ -772,7 +771,10 @@ def main():
             else:
                 reciprocal_sequences = 0
         else:
-            reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+            if regions != 0
+                reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+            else:
+                reciprocal_sequences = 0
 
     ################ add sequences to extended.fa in the output folder##########
 

From ac929b7f87c55870f83cb2201d1bad8e4a2d56c2 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 14:01:00 +0200
Subject: [PATCH 073/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 02f627f..c98f6a7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -771,7 +771,7 @@ def main():
             else:
                 reciprocal_sequences = 0
         else:
-            if regions != 0
+            if regions != 0:
                 reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
             else:
                 reciprocal_sequences = 0

From 060b4bb10297df20b627a6b71324c4926eef616a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 14:03:49 +0200
Subject: [PATCH 074/192] testing

---
 fdog/fDOGassembly.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c98f6a7..524b83f 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -739,6 +739,7 @@ def main():
 
         if regions == 0:
             #no candidat region are available, no ortholog can be found
+            reciprocal_sequences = 0
             if refBool == True:
                 print("No candidate region found")
                 continue
@@ -771,10 +772,7 @@ def main():
             else:
                 reciprocal_sequences = 0
         else:
-            if regions != 0:
-                reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
-            else:
-                reciprocal_sequences = 0
+            reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
     ################ add sequences to extended.fa in the output folder##########
 

From c996ca6287c601856bce1ab849bcd4bdaf9f86bf Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 14:10:07 +0200
Subject: [PATCH 075/192] testing

---
 fdog/fDOGassembly.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 524b83f..07dbe83 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -384,6 +384,10 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
     return list(orthologs), seed
 
 def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
+    print(output)
+    print(refBool)
+    print(core_fasta)
+    print(species_list)
 
     output_file = open(output, "a+")
     if refBool == False:
@@ -739,10 +743,11 @@ def main():
 
         if regions == 0:
             #no candidat region are available, no ortholog can be found
-            reciprocal_sequences = 0
+            print("No candidate region found")
             if refBool == True:
-                print("No candidate region found")
                 continue
+            else:
+                reciprocal_sequences = 0
         else:
             print(str(number_regions) + " candiate regions were found. Extracting sequences...")
             extract_seq(regions, db_path, tmp_path, mode)
@@ -766,7 +771,8 @@ def main():
 
     ################## checking accepted genes for co-orthologs ################
         if reciprocal_sequences == 0:
-            print("No ortholog fulfilled the reciprocity criteria")
+            if regions != 0:
+                print("No ortholog fulfilled the reciprocity criteria")
             if searchTaxon == '' and refBool == True:
                 continue
             else:

From 3f46b83ad88816c741779f6a378e5f4ace1a6a11 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 14:15:41 +0200
Subject: [PATCH 076/192] testing

---
 fdog/fDOGassembly.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 07dbe83..09ac05e 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -747,6 +747,7 @@ def main():
             if refBool == True:
                 continue
             else:
+                taxa = fdog_ref_species
                 reciprocal_sequences = 0
         else:
             print(str(number_regions) + " candiate regions were found. Extracting sequences...")
@@ -765,6 +766,7 @@ def main():
                     continue
                 else:
                     reciprocal_sequences = 0
+                    taxa = fdog_ref_species
             else:
                 reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 

From b5924a81f6784730b6863c298025aafee79614ae Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 14:21:39 +0200
Subject: [PATCH 077/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 09ac05e..ca89dd1 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -747,7 +747,7 @@ def main():
             if refBool == True:
                 continue
             else:
-                taxa = fdog_ref_species
+                taxa = [fdog_ref_species]
                 reciprocal_sequences = 0
         else:
             print(str(number_regions) + " candiate regions were found. Extracting sequences...")
@@ -766,7 +766,7 @@ def main():
                     continue
                 else:
                     reciprocal_sequences = 0
-                    taxa = fdog_ref_species
+                    taxa = [fdog_ref_species]
             else:
                 reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 

From 490f43cc42b3e8122441f12dcded7cb8f1a26a7b Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 15:05:58 +0200
Subject: [PATCH 078/192] added function to clean up .domain files

---
 fdog/fDOGassembly.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index ca89dd1..3c837dd 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -384,10 +384,6 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
     return list(orthologs), seed
 
 def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
-    print(output)
-    print(refBool)
-    print(core_fasta)
-    print(species_list)
 
     output_file = open(output, "a+")
     if refBool == False:
@@ -489,16 +485,17 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
 
     return checked
 
-def changes_for_fas(file, header, mode):
-    #def replace_first_line( src_filename, target_filename, replacement_line):
-    f_in = open(file)
-    first_line, remainder = f.readline(), f.read()
-    line = first_line.split("|")[0]
-    f_in.close()
-    f_out = open(file + "s","w")
-    f_out.write(line + "\n")
-    f_out.write(remainder)
-    f_out.close()
+def clean_fas(path):
+    file = open(path, "r")
+    lines = file.readlines()
+    file.close()
+    file.open(path,"w")
+
+    for line in lines:
+        long_id, remain = line.split("#")
+        id = long_id.split("|")[0]
+        new_line = id + "#" + remain
+        file.write(new_line)
 
 class Logger(object):
     def __init__(self, file):
@@ -811,6 +808,8 @@ def main():
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         starting_subprocess(cmd, mode)
+        clean_fas(group + "_forward.domains")
+        clean_fas(group + "_reverse.domains")
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)

From 07c693d795385bfd0d1941271e8228aa6c71c240 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 15:15:11 +0200
Subject: [PATCH 079/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 3c837dd..d50bfe8 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -808,8 +808,8 @@ def main():
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         starting_subprocess(cmd, mode)
-        clean_fas(group + "_forward.domains")
-        clean_fas(group + "_reverse.domains")
+        clean_fas(out + group + "_forward.domains")
+        clean_fas(out + group + "_reverse.domains")
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)

From 3d804229698eb08161c2edd537dec774f6470a70 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 15:25:05 +0200
Subject: [PATCH 080/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d50bfe8..75e10f1 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -489,7 +489,7 @@ def clean_fas(path):
     file = open(path, "r")
     lines = file.readlines()
     file.close()
-    file.open(path,"w")
+    file = open(path,"w")
 
     for line in lines:
         long_id, remain = line.split("#")

From acdb6fe068a7d221d780d651660d6da6c45a830c Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 15:47:07 +0200
Subject: [PATCH 081/192] testing

---
 fdog/fDOGassembly.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 75e10f1..23359d3 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -401,7 +401,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
         for entry_candidate in seq_records_candidate:
             if entry_candidate.id in sequenceIds:
                 if entry_candidate.id == sequenceIds[0]:
-                    print(entry_candidate.id)
                     output_file.write(">" + entry_candidate.id + "|1" + "\n")
                     output_file.write(str(entry_candidate.seq) + "\n")
                 else:
@@ -485,16 +484,22 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
 
     return checked
 
-def clean_fas(path):
+def clean_fas(path, file_type):
     file = open(path, "r")
     lines = file.readlines()
     file.close()
     file = open(path,"w")
 
     for line in lines:
-        long_id, remain = line.split("#")
-        id = long_id.split("|")[0]
-        new_line = id + "#" + remain
+        if file_type == 'domains':
+            long_id, remain = line.split("#")
+            id = long_id.split("|")[0]
+            new_line = id + "#" + remain
+        else:
+            long_id, remain = line.split("\t")
+            id = long_id.split("|")[0]
+            new_line = id + "\t" + remain
+
         file.write(new_line)
 
 class Logger(object):
@@ -808,8 +813,9 @@ def main():
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         starting_subprocess(cmd, mode)
-        clean_fas(out + group + "_forward.domains")
-        clean_fas(out + group + "_reverse.domains")
+        clean_fas(out + group + "_forward.domains", 'domains')
+        clean_fas(out + group + "_reverse.domains", 'domains')
+        clean_fas(out + group + ".phyloprofile", 'phyloprofile')
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)

From 38aca29591e1a54430a7e395bad343657a13aef8 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 28 Apr 2021 15:58:34 +0200
Subject: [PATCH 082/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 23359d3..a021483 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -496,7 +496,7 @@ def clean_fas(path, file_type):
             id = long_id.split("|")[0]
             new_line = id + "#" + remain
         else:
-            long_id, remain = line.split("\t")
+            long_id, remain = line.split("\t", 1)
             id = long_id.split("|")[0]
             new_line = id + "\t" + remain
 

From f46cdc0e65616bf95a13f8e69268092584399419 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 11 May 2021 15:59:56 +0200
Subject: [PATCH 083/192] improve user output

---
 fdog/fDOGassembly.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index a021483..d5184b2 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -149,8 +149,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
             name = key + "_" + str(counter)
             # augutus call
             cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff"
-            #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
-            starting_subprocess(cmd, mode)
+            #print(cmd)
+            starting_subprocess(cmd, 'silent')
             # transfer augustus output to as sequence
             cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
             starting_subprocess(cmd, mode)
@@ -396,6 +396,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
                     output_file.write(str(entry_core.seq) + "\n")
 
     if sequenceIds != 0:
+        #print(sequenceIds)
         seq_records_candidate = readFasta(candidate_fasta)
         seq_records_candidate = list(seq_records_candidate)
         for entry_candidate in seq_records_candidate:
@@ -677,7 +678,7 @@ def main():
 
     print("Building a block profile for gene " + group + " \n")
     cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
-    starting_subprocess(cmd, mode)
+    starting_subprocess(cmd, 'silent')
 
     if int(os.path.getsize(profile_path)) > 0:
         print("block profile is finished \n")
@@ -689,7 +690,7 @@ def main():
         starting_subprocess(cmd, mode)
         cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
         #print(cmd)
-        starting_subprocess(cmd, mode)
+        starting_subprocess(cmd, 'silent')
         print("block profile is finished \n")
 
     searchBool = False
@@ -798,7 +799,7 @@ def main():
             cmd = 'mkdir ' + tmp_path + 'anno_dir'
             starting_subprocess(cmd, 'silent')
             cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
-            starting_subprocess(cmd, mode)
+            starting_subprocess(cmd, 'silent')
     #if we searched in more than one Taxon and no ortholog was found
 
     if refBool == False and searchTaxon == '':
@@ -812,7 +813,7 @@ def main():
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
-        starting_subprocess(cmd, mode)
+        starting_subprocess(cmd, 'silent')
         clean_fas(out + group + "_forward.domains", 'domains')
         clean_fas(out + group + "_reverse.domains", 'domains')
         clean_fas(out + group + ".phyloprofile", 'phyloprofile')

From b662346b1a96358729427f630685957e60058ad5 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 13:20:19 +0200
Subject: [PATCH 084/192] fdog.assembly started with fDOG is always silent

---
 .DS_Store          | Bin 6148 -> 6148 bytes
 fdog/.DS_Store     | Bin 8196 -> 8196 bytes
 fdog/bin/oneSeq.pl |   2 +-
 3 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/.DS_Store b/.DS_Store
index 824f712743a6414728f27d69a840e656771a9cdf..bcbd073c8626ea73a8116c4f66a9c94aeb88f9c8 100644
GIT binary patch
delta 34
pcmZoMXffDuo<-Ei&`3wY(8#P-N1@u#$lOpz!PLTh^Ai>YVF0V92}l3{

delta 34
ncmZoMXffDuo<-Eqz(hyE(Acn6N1@u#2*fcrG1>ftML`$<s|N`?

diff --git a/fdog/.DS_Store b/fdog/.DS_Store
index f638c26f261f65006cc1f23e73235d93f5bd4f0a..34e42555d35fd3e0f289e49c57c3fa62ffc1f870 100644
GIT binary patch
delta 159
zcmZp1XmOa}&nUVvU^hRb=w=>)ct%kpLn9ppLnE_V9ffK`BXdI?1yc+2%?*M}8727{
z(imJA{2AOC5*dntFqI*fA&DV}p>nc<fQSy$RR#tIz5ieUWHB&^0i{zJiWt&?q$8RJ
g17QU|W@ls#o0Wv4SvIpvd}Em`C>Xx^l*k4q0K|bT3IG5A

delta 49
zcmV-10M7q}K!iY$PXQCLP`eKS6SE8uUjdWD5=yhR5pV&suM=1Vk${S`2N?DQvxgOc
H0+E1+rmhjk

diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl
index 7139af7..7e8a248 100755
--- a/fdog/bin/oneSeq.pl
+++ b/fdog/bin/oneSeq.pl
@@ -701,7 +701,7 @@
 		if ($assembly){
 			$eval_blast = sprintf("%f", $eval_blast);
 			if ($seqFile ne "") {
-				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath);
+				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent");
 
 				if (defined $assemblyPath){
 					push(@assembly_cmd, "--assemblyPath $assemblyPath")

From a751205c0bdc4832cb26a8955b3a04e05f332046 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 13:41:04 +0200
Subject: [PATCH 085/192] testing

---
 fdog/bin/oneSeq.pl   | 2 +-
 fdog/fDOGassembly.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl
index 7e8a248..7139af7 100755
--- a/fdog/bin/oneSeq.pl
+++ b/fdog/bin/oneSeq.pl
@@ -701,7 +701,7 @@
 		if ($assembly){
 			$eval_blast = sprintf("%f", $eval_blast);
 			if ($seqFile ne "") {
-				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent");
+				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath);
 
 				if (defined $assemblyPath){
 					push(@assembly_cmd, "--assemblyPath $assemblyPath")
diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d5184b2..8884fba 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -396,7 +396,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
                     output_file.write(str(entry_core.seq) + "\n")
 
     if sequenceIds != 0:
-        #print(sequenceIds)
+        print(sequenceIds)
         seq_records_candidate = readFasta(candidate_fasta)
         seq_records_candidate = list(seq_records_candidate)
         for entry_candidate in seq_records_candidate:

From eb9f585088bad8b476c02add5fbd8a78bead8c84 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 13:54:32 +0200
Subject: [PATCH 086/192] testing output

---
 fdog/fDOGassembly.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 8884fba..4e9e6be 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -396,7 +396,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
                     output_file.write(str(entry_core.seq) + "\n")
 
     if sequenceIds != 0:
-        print(sequenceIds)
         seq_records_candidate = readFasta(candidate_fasta)
         seq_records_candidate = list(seq_records_candidate)
         for entry_candidate in seq_records_candidate:
@@ -800,6 +799,11 @@ def main():
             starting_subprocess(cmd, 'silent')
             cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
             starting_subprocess(cmd, 'silent')
+            clean_fas(out + group + "_forward.domains", 'domains')
+            clean_fas(out + group + "_reverse.domains", 'domains')
+            clean_fas(out + group + ".phyloprofile", 'phyloprofile')
+
+
     #if we searched in more than one Taxon and no ortholog was found
 
     if refBool == False and searchTaxon == '':

From bb3c148b46b874865e67314a88b07b443c9dcfeb Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 14:00:32 +0200
Subject: [PATCH 087/192] testing

---
 fdog/fDOGassembly.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 4e9e6be..1b84a1e 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -631,6 +631,10 @@ def main():
         out = os.getcwd()
         os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
         out = out + '/' + group + '/'
+    else:
+        if out[-1] != "/":
+            out = out + "/"
+
 
     try:
         f = open(out + "/fdog.log", "a+")

From be2b9d4b3b1ea5a5e8ba214ff0c5d5754a4f82e8 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 14:01:43 +0200
Subject: [PATCH 088/192] testing

---
 fdog/mergeAssemblyOutput.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
index ea6e084..11d5c36 100644
--- a/fdog/mergeAssemblyOutput.py
+++ b/fdog/mergeAssemblyOutput.py
@@ -107,6 +107,7 @@ def main():
                 set_fasta = header
             if cleanup == True:
                 os.remove(directory + '/' +infile)
+                os.system("rm *.tsv")
 
     if phyloprofile:
         phyloprofile.close()

From 6fbd5aadcc9ee3151ddfd1fb75a8e882b83bf1b2 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 14:06:19 +0200
Subject: [PATCH 089/192] testing

---
 fdog/fDOGassembly.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 1b84a1e..de9f343 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -803,9 +803,9 @@ def main():
             starting_subprocess(cmd, 'silent')
             cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
             starting_subprocess(cmd, 'silent')
-            clean_fas(out + group + "_forward.domains", 'domains')
-            clean_fas(out + group + "_reverse.domains", 'domains')
-            clean_fas(out + group + ".phyloprofile", 'phyloprofile')
+            clean_fas(fasOutFile + "_forward.domains", 'domains')
+            clean_fas(fasOutFile + "_reverse.domains", 'domains')
+            clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile')
 
 
     #if we searched in more than one Taxon and no ortholog was found

From 34d683c8aaa9529344b070a0fdccaebce77a10f3 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 14:07:48 +0200
Subject: [PATCH 090/192] testing

---
 fdog/mergeAssemblyOutput.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
index 11d5c36..79a1306 100644
--- a/fdog/mergeAssemblyOutput.py
+++ b/fdog/mergeAssemblyOutput.py
@@ -107,7 +107,7 @@ def main():
                 set_fasta = header
             if cleanup == True:
                 os.remove(directory + '/' +infile)
-                os.system("rm *.tsv")
+                os.system("rm  " + directory + "/*.tsv")
 
     if phyloprofile:
         phyloprofile.close()

From f9504745c247595c867669695d8b302fd30571a7 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 14:14:29 +0200
Subject: [PATCH 091/192] testing

---
 fdog/bin/oneSeq.pl          | 2 +-
 fdog/mergeAssemblyOutput.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl
index 7139af7..7e8a248 100755
--- a/fdog/bin/oneSeq.pl
+++ b/fdog/bin/oneSeq.pl
@@ -701,7 +701,7 @@
 		if ($assembly){
 			$eval_blast = sprintf("%f", $eval_blast);
 			if ($seqFile ne "") {
-				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath);
+				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent");
 
 				if (defined $assemblyPath){
 					push(@assembly_cmd, "--assemblyPath $assemblyPath")
diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
index 79a1306..6c865a1 100644
--- a/fdog/mergeAssemblyOutput.py
+++ b/fdog/mergeAssemblyOutput.py
@@ -107,7 +107,7 @@ def main():
                 set_fasta = header
             if cleanup == True:
                 os.remove(directory + '/' +infile)
-                os.system("rm  " + directory + "/*.tsv")
+                os.system("rm  " + directory + "/'*.tsv'")
 
     if phyloprofile:
         phyloprofile.close()

From 0b129a293cd1fcf30770883e1796bd830f8e4dee Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 31 May 2021 14:28:05 +0200
Subject: [PATCH 092/192] removing automatically .tsv files if existing

---
 fdog/mergeAssemblyOutput.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
index 6c865a1..1606b1d 100644
--- a/fdog/mergeAssemblyOutput.py
+++ b/fdog/mergeAssemblyOutput.py
@@ -107,7 +107,8 @@ def main():
                 set_fasta = header
             if cleanup == True:
                 os.remove(directory + '/' +infile)
-                os.system("rm  " + directory + "/'*.tsv'")
+        elif infile.endswith('.tsv'):
+            os.remove(directory + '/' + infile)
 
     if phyloprofile:
         phyloprofile.close()

From 6c6b1258f1376b0cff530e1492c7a40200946915 Mon Sep 17 00:00:00 2001
From: mueli94 <47216555+mueli94@users.noreply.github.com>
Date: Mon, 31 May 2021 15:35:20 +0200
Subject: [PATCH 093/192] Fdog goes assembly (#8)

* testing

* shorten long header for addTaxon, check for long headers in oneseq and checkData

* testing

* testing

* testing

* changed path in hamstr.pl to current directory

* changed path in hamstr.pl to current directory

* testing

* testing

* testing

* testing

* testing

* testing

* bug fix

* bug fix

* fixed error mapping ID file not found

* testing

* testing

* testing

* test

* test

* testing

* testing

* testing

* testing

* fDOGassembly is working on complete assembly_dir

* bug fix

* bug fix

* enabled option -filter for blastp search

* bug fix fasoff

* testing --strict option

* bug fix in --strict option, output is corrected

* bug fix in --checkCoorthologsRef

* bug fix

* clean up

* bug fix

* adapted handling of variable dataPath

* testing

* testing

* testing

* testing

* test

* test

* test

* test

* test

* test

* testing

* bug fix assemblyDir

* testing

* testing

* testing search taxa

* test

* enable --searchTaxa option in fdog.assembly

* bug fix

* testing

* testing --searchTaxa adaption

* testing

* test

* test

* write debug files to output dir

* skip fa.mapping while checking genome_dir

* testing

* bug fix

* testing

* bug fix

* bug fix

* path fix in augustus_ppx

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* testing

* testing

* added new python script to merge Assembly output from the same Gene but different searchTaxa

* added option to merge Assembly output after fDOG calls fdog.assembly multiple times with different searchTaxa

* bug fix

* corrected fdog.mergeAssembly call

* testing

* testing

* testing

* test

* moved fdog.mergeAssembly call to another place

* testing

* testing

* testing

* testing

* testing

* testing

* corrected fdog.mergeAssembly call

* testing

* testing

* testing

* testing

* test

* disable weight_dir check if option --assembly is used

* adapted fdog.assembly call

* adapted calcFAS call to deactivate .tsv output

* testing

* testing

* bug fix in function backward search used with option --strict

* testing new added option --silent

* added more checks to fdogs.run

* bug fix

* testing

* testing

* testing

* bug fix

* bug fix

* testing

* testing silent mode

* testing --silent

* symlinks for fasta36 input; improved fdogs.run according to #5

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* test

* test

* testing

* testing new function to identify coorthologs

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* finished function coorthologs

* bug fix runSingle.py

* cleaning output

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* bug fix if augustus can't identify a gene at a candidate region

* testing

* bug fix

* bug fix

* cleaning up

* testing

* testing

* testing

* testing

* bug fix in merge function, regions in minus strand were not merged correctly

* testing

* testing

* testing

* testing

* testing

* bug fix

* testing

* testing

* testing

* testing

* testing

* clean up

* testing

* testing

* testing

* testing

* bug fix

* testing new tblastn call

* testing

* testing

* testing

* testing

* testing

* code clean up

* clean up code

* clean up

* clean up

* reduce output

* clean up code

* check augustus

* testing

* adding option to recognize if co-ortholog or not in header of the extended.fa

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* added function starting_subprocess() to handle call of extern tools more easily

* added augustus to dependencies

* testing

* bug fix

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* added function to clean up .domain files

* testing

* testing

* testing

* testing

* improve user output

* fdog.assembly started with fDOG is always silent

* testing

* testing output

* testing

* testing

* testing

* testing

* testing

* automatically removing .tsv files if they exist

Co-authored-by: trvinh <trvinh@gmail.com>
---
 .DS_Store                               | Bin 6148 -> 6148 bytes
 .github/workflows/github_build.yml      |  51 ++
 .gitignore                              |   3 +-
 .travis.yml                             |  23 +-
 README.md                               |   3 +-
 fdog/.DS_Store                          | Bin 8196 -> 8196 bytes
 fdog/addTaxa.py                         |   3 +-
 fdog/addTaxon.py                        |  16 +-
 fdog/bin/hamstr.pl                      | 103 +--
 fdog/bin/oneSeq.pl                      | 340 ++++++----
 fdog/checkData.py                       |  11 +-
 fdog/data/.DS_Store                     | Bin 8196 -> 6148 bytes
 fdog/fDOGassembly.py                    | 837 ++++++++++++++++++++++++
 fdog/fdog_goes_assembly/.DS_Store       | Bin 6148 -> 0 bytes
 fdog/fdog_goes_assembly/fDOGassembly.py | 209 ------
 fdog/mergeAssemblyOutput.py             | 124 ++++
 fdog/runMulti.py                        | 207 ++++--
 fdog/runSingle.py                       | 149 ++++-
 fdog/setup/setup.sh                     |   3 +-
 fdog/setup/setup_conda.sh               |   6 +-
 fdog/setupfDog.py                       |   6 +-
 setup.py                                |  10 +-
 22 files changed, 1619 insertions(+), 485 deletions(-)
 create mode 100644 .github/workflows/github_build.yml
 create mode 100644 fdog/fDOGassembly.py
 delete mode 100644 fdog/fdog_goes_assembly/.DS_Store
 delete mode 100644 fdog/fdog_goes_assembly/fDOGassembly.py
 create mode 100644 fdog/mergeAssemblyOutput.py

diff --git a/.DS_Store b/.DS_Store
index c84405d9d29ae54bb91cc188eb50403196c8adc3..bcbd073c8626ea73a8116c4f66a9c94aeb88f9c8 100644
GIT binary patch
delta 171
zcmZoMXfc=|#>B!ku~2NHo+2a1#(>?7i&&T#IVSTk*)yJ+?8DTcC0Si<WN4(LU}$7k
ztD{hDXk>1vqhM-bUR%q_A+Bm@>zR;SSyf$ATQ_U+S0;JJ*^@<?AL{ZklrSVRR4}A4
zlrrQp6fo$4`9(lFouSGzCqFqUCqIdSfkA+Qfhm3SbLPj4o7p+|Ie<>t{E_)P^JIPz
RM-E1y98d|v<_M8B%m6dnE297a

delta 121
zcmZoMXfc=|#>B)qu~2NHo+2ar#(>?7jO>$nSnL^3PWEAG(3GgIHZ(BNQ7|+%tkqGd
zwlp%(Q7|*KtgYqb5LY#{^-RdEtg5c5t(!Ud8;d;StjS`m4>yZ(h_Y<v=HTW48oXJM
Y<2&<Yei26w1|VQ$U|`uCA+m-U0J;Dk?*IS*

diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml
new file mode 100644
index 0000000..14f1a20
--- /dev/null
+++ b/.github/workflows/github_build.yml
@@ -0,0 +1,51 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: build
+
+on:
+  push:
+    branches: [ master ]
+    tags:
+    - '*'
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.8]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest wheel
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test
+      run: |
+        pwd
+        pip install .
+        fdog.setup -o /home/runner/work/fDOG/fDOG/dt --lib
+        fdog.setup -o /home/runner/work/fDOG/fDOG/dt
+        fdog.showTaxa
+        fdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3 --fasoff
+    - name: Deploy
+      if: startsWith(github.event.ref, 'refs/tags')
+      uses: casperdcl/deploy-pypi@v2
+      with:
+        password: ${{ secrets.pypi }}
+        build: true
+        upload: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') }}
diff --git a/.gitignore b/.gitignore
index 3a64523..38cf321 100644
--- a/.gitignore
+++ b/.gitignore
@@ -131,4 +131,5 @@ dmypy.json
 #Hannah
 /fdog/data/core_orthologs/
 /fdog/data/assembly_dir/
-/fdog/fdog_goes_assembly/tmp/
+/fdog/fdog_goes_assembly/tmp/ 
+taxdump*
diff --git a/.travis.yml b/.travis.yml
index b841459..29761d6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,17 +1,22 @@
 language: python
 python:
 - '3.8'
+sudo: required
 install:
 - pip install .
-script: python3 fdog/runSingle.py -h
-deploy:
-  provider: pypi
-  username: __token__
-  password:
-    secure: or/c8/8BANMAQRSa6W3d1CV4mNBULQMR1SJ9l8YSR5o2ufcKO43Jf6apZq+0qb4805516ihpEGceKJ1aiM7bMHru66qlqtj/Jv1QOr40bjsbfCXlHXp8sJ74boWIhk3BzQCtbM1CbO6IKceITNmHslwABMNRSSOELYoJoxJ9w4NfsiYdeVPu1TNSzxRfLzNI3q2Ku2uei+vLlDxvpjYeasKfQlWBb/aydFo8TI403ftmIIeX9ATOmWHssRYGYnNKE8XHU5AX4ka5gzm1fzgjyv4h3NmD0MAaJlEJjdYYE0WGvRvmKEMtUzlCAcQnyU1ALVm4mC/4fAceuVfr7uvRZojVlS2zdTXFQpkHR6dwZ0B74+Z3p2bSdjTV+glt1nZ3DUpxWfaCrFX8viIZE30HtMaGD3Gii142cK9kdpmBQDrmPlVdinKz/NI7bY39/lWcTX4PO0SQsZUoXlb0JmVYGwPpvlAk83Ba7ZglQ802PWI2AEPXdnlvttSP7+2xfG5B/2CRpt+EzIaw/zl1hfy8ZEHfJT1h8hQXNsbqNKELd1A0Q9L/5x8YQ/TIcm/gpPjmT6Exx5qNJ91lNhQE9MordjzjT4uU91v1e3NZa/Ar39ZH6I/wtW44rcGqpqKqalGGzQ8fNScasoE9POCVNiA3GnjYe4PZqlS3sr8mM+G6+3A=
-  on:
-    tags: true
-
+script:
+- fdog.setup -o $TRAVIS_BUILD_DIR --lib
+- fdog.setup -o $TRAVIS_BUILD_DIR
+- fdog.showTaxa
+- fdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3 --fasoff
 branches:
   only:
     - master
+    - /^v.*$/
+# deploy:
+#   provider: pypi
+#   username: __token__
+#   password:
+#     secure: or/c8/8BANMAQRSa6W3d1CV4mNBULQMR1SJ9l8YSR5o2ufcKO43Jf6apZq+0qb4805516ihpEGceKJ1aiM7bMHru66qlqtj/Jv1QOr40bjsbfCXlHXp8sJ74boWIhk3BzQCtbM1CbO6IKceITNmHslwABMNRSSOELYoJoxJ9w4NfsiYdeVPu1TNSzxRfLzNI3q2Ku2uei+vLlDxvpjYeasKfQlWBb/aydFo8TI403ftmIIeX9ATOmWHssRYGYnNKE8XHU5AX4ka5gzm1fzgjyv4h3NmD0MAaJlEJjdYYE0WGvRvmKEMtUzlCAcQnyU1ALVm4mC/4fAceuVfr7uvRZojVlS2zdTXFQpkHR6dwZ0B74+Z3p2bSdjTV+glt1nZ3DUpxWfaCrFX8viIZE30HtMaGD3Gii142cK9kdpmBQDrmPlVdinKz/NI7bY39/lWcTX4PO0SQsZUoXlb0JmVYGwPpvlAk83Ba7ZglQ802PWI2AEPXdnlvttSP7+2xfG5B/2CRpt+EzIaw/zl1hfy8ZEHfJT1h8hQXNsbqNKELd1A0Q9L/5x8YQ/TIcm/gpPjmT6Exx5qNJ91lNhQE9MordjzjT4uU91v1e3NZa/Ar39ZH6I/wtW44rcGqpqKqalGGzQ8fNScasoE9POCVNiA3GnjYe4PZqlS3sr8mM+G6+3A=
+#   on:
+#     tags: true
diff --git a/README.md b/README.md
index e210620..52f11e2 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 [![PyPI version](https://badge.fury.io/py/fdog.svg)](https://pypi.org/project/fdog/)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
 [![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG)
+![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg)
 
 # Table of Contents
 * [How to install](#how-to-install)
@@ -41,7 +42,7 @@ export PATH=$HOME/.local/bin:$PATH
 
 After installing *fdog*, you need to setup *fdog* to get its dependencies and pre-calculated data.
 
-**NOTE**: in case you haven't installed [greedyFAS](https://github.com/BIONF/FAS) before, it will be installed automatically within *fDOG* setup. However, you need to run [setupFAS](https://github.com/BIONF/FAS/wiki/setupFAS) after *fDOG* setup finished before actually using *fDOG*! 
+**NOTE**: in case you haven't installed [greedyFAS](https://github.com/BIONF/FAS) before, it will be installed automatically within *fDOG* setup. However, you need to run [setupFAS](https://github.com/BIONF/FAS/wiki/setupFAS) after *fDOG* setup finished before actually using *fDOG*!
 
 You can setup fDOG by running this command
 ```
diff --git a/fdog/.DS_Store b/fdog/.DS_Store
index 8a155421181e7c2b24e87c68d5d9f3db5ec8efd7..34e42555d35fd3e0f289e49c57c3fa62ffc1f870 100644
GIT binary patch
delta 377
zcmZp1XmOa}&nUVvU^hRb=wu#&latpAu=6C97Z)Vu<R>vOFr1p~CD@=PSzT>pXr!ZH
zXk=Eaqfl*VWNxUVU}|ArTg%BIu4-uOnUGsqRb5kCH*2zwu)O$ephjkfB!*0eyt3e;
zyqx^Jbf5_1<VynOjBo=D4NQOr8XH0k1aZtwOt2eh3Nny|A%!84p@bn3*+`%`KSLUW
z3xhv{J5aV52vZqyfmY`*R8Ce95Yb_}3RI%^9}Ivj1_m*pbSgs;LpqRjMANWNSb>k(
y8Ck>T1;T=i+^k?@OBhNS3MShL-VjIPBRgjEI*|aT&Fm82SSFtlT8u}PFB1SkZfHaR

delta 203
zcmZp1XmOa}&nUDpU^hRb&}1Hglao&imQ21Za&|JWu!OP@LmER0Lq0<~Lp%`YGo&&U
z16hecvKYwDWk_PknS4=H#E4<ue=uMG0U|UpR8Br2a>I})m1V(2c{%xc=|GbiH?I*E
cWZb+<)R$>v11IBVc8PB+o7uz|fjl-w0N=DXc>n+a

diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py
index 1c83bb5..d392c8c 100644
--- a/fdog/addTaxa.py
+++ b/fdog/addTaxa.py
@@ -95,7 +95,7 @@ def runAddTaxon(args):
         sys.exit('Problem running\n%s' % (cmd))
 
 def main():
-    version = '0.0.1'
+    version = '0.0.5'
     parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
@@ -125,6 +125,7 @@ def main():
             sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
         with open(pathconfigFile) as f:
             outPath = f.readline().strip()
+    outPath = os.path.abspath(outPath)
     noAnno = args.noAnno
     coreTaxa = args.coreTaxa
     oldFAS = args.oldFAS
diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py
index e09f1e4..fe0a810 100755
--- a/fdog/addTaxon.py
+++ b/fdog/addTaxon.py
@@ -77,13 +77,13 @@ def runBlast(args):
         subprocess.call([blastCmd], shell = True)
     except:
         sys.exit('Problem with running %s' % blastCmd)
-    fileInGenome = "%s/genome_dir/%s/%s.fa" % (outPath, specName, specName)
+    fileInGenome = "../../genome_dir/%s/%s.fa" % (specName, specName)
     fileInBlast = "%s/blast_dir/%s/%s.fa" % (outPath, specName, specName)
     if not Path(fileInBlast).exists():
         os.symlink(fileInGenome, fileInBlast)
 
 def main():
-    version = '0.0.2'
+    version = '0.0.5'
     parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
@@ -115,6 +115,7 @@ def main():
             sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
         with open(pathconfigFile) as f:
             outPath = f.readline().strip()
+    outPath = os.path.abspath(outPath)
     noAnno = args.noAnno
     coreTaxa = args.coreTaxa
     ver = str(args.verProt)
@@ -152,10 +153,13 @@ def main():
             seq = str(inSeq[id].seq)
             # check ID
             id = re.sub('\|', '_', id)
-            if len(id) > 80:
-                # modIdIndex = modIdIndex + 1
-                # id = specName + "_" + str(modIdIndex)
+            oriId = id
+            if len(id) > 30:
+                modIdIndex = modIdIndex + 1
+                id = specName + "_" + str(modIdIndex)
                 longId = 'yes'
+                with open(specFile + '.mapping', 'a') as mappingFile:
+                    mappingFile.write('%s\t%s\n' % (id, oriId))
             if not id in tmpDict:
                 tmpDict[id] = 1
             else:
@@ -184,7 +188,7 @@ def main():
         cf.close()
         # warning about long header
         if longId == 'yes':
-            print('\033[91mWARNING: Headers are longer than 80 characters. It could cause some troubles!\033[0m')
+            print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile)
     else:
         print(genomePath + '/' + specName + '.fa already exists!')
 
diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl
index 37ae73a..7ff125e 100755
--- a/fdog/bin/hamstr.pl
+++ b/fdog/bin/hamstr.pl
@@ -193,9 +193,11 @@
 ## 13.07.2020 (v13.3.0 - vinh) solved problem when gene ID contains PIPE
 ## 22.07.2020 (v13.4.0 - vinh) moved tmp blast files to output folder and delete them when finished
 ## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef
+## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name
+## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory
 
 ######################## start main ###########################################
-my $version = "HaMStR v.13.4.1";
+my $version = "HaMStR v.13.4.4";
 ######################## checking whether the configure script has been run ###
 my $configure = 0;
 if ($configure == 0){
@@ -214,8 +216,9 @@
 my $filter = 'F'; # low complexity filter switch. Default 'on'. Set of 'F' to turn off permanently.
 my $eval_blast = 10; # default evalue cutoff for the blast search
 ########## EDIT THE FOLLOWING LINES TO MODIFY DEFAULT PATHS ###################
-my $path = abs_path(dirname(__FILE__));
-$path =~ s/\/bin//;
+# my $path = abs_path(dirname(__FILE__));
+# $path =~ s/\/bin//;
+my $path = getcwd;
 my $hmmpath = "$path/core_orthologs"; #path where the hmms are located
 my $blastpath = "$path/blast_dir"; #path to the blast-dbs
 my $outpath = '.';
@@ -223,10 +226,10 @@
 my $hmm_dir = 'hmm_dir';
 my $fa_dir  = 'fa_dir';
 ##############################
-my $termios = new POSIX::Termios; $termios->getattr;
-my $ospeed = $termios->getospeed;
-my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed };
-my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/;
+# my $termios = new POSIX::Termios; $termios->getattr;
+# my $ospeed = $termios->getospeed;
+# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed };
+# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/;
 
 ############################## Variables ##############
 my $fileobj;
@@ -322,16 +325,16 @@
 }
 ## help message
 my $helpmessage = "
-${bold}YOU ARE RUNNING $version on $hostname$norm
+YOU ARE RUNNING $version on $hostname
 
 This program is freely distributed under a GPL.
 Copyright (c) GRL limited: portions of the code are from separate copyrights
 
-\n${bold}USAGE:${norm} hamstr -sequence_file=<> -hmmset=<> -taxon=<>  -refspec=<> [OPTIONS]
+\nUSAGE: hamstr -sequence_file=<> -hmmset=<> -taxon=<>  -refspec=<> [OPTIONS]
 
-${bold}OPTIONS:$norm
+OPTIONS:
 
-${bold}REQUIRED$norm
+REQUIRED
 -sequence_file=<>
 		path and name of the file containing the sequences hmmer is run against.
 -hmmset=<>
@@ -359,7 +362,7 @@
 		set this flag if you are searching in protein sequences. Note, if neither the -est nor the -protein flag is set, HaMStR will
 		guess the sequence type.
 
-${bold}USING NON-DEFAULT PATHS$norm
+USING NON-DEFAULT PATHS
 
 -blastpath=<>
 		Lets you specify the absolute or relative path to the blast databases. DEFAULT: $blastpath
@@ -368,7 +371,7 @@
 -outpath=<>
 		You can determine the path to the HaMStR output. Default: current directory.
 
-${bold}ADDITIONAL OPTIONS$norm
+ADDITIONAL OPTIONS
 
 -append
 		set this flag if the output should be appended to the files *.out and *_cds.out. This becomes relevant when running
@@ -412,7 +415,7 @@
 -hmm
 		Option to provide only a single hmm to be used for the search.
 		Note, this file has to end with .hmm
--intron=<${bold}k${norm}eep|${bold}m${norm}ask|${bold}r${norm}emove>
+-intron=<keep|mask|remove>
 		Specify how to deal with introns that may occur in transcript sequences. Default: keep - Introns will be retained in the transcript
 		but will be identified by lower case letters.
 -longhead
@@ -512,7 +515,7 @@
 ## 1) check if all information is available to run HaMStR
 ($check, @log) = &checkInput();
 if ($check == 0) {
-	print "\n\n${bold}There was an error running $version$norm\n\n";
+	print "\n\nThere was an error running $version\n\n";
 	print join "\n", @log;
 	exit;
 }
@@ -783,11 +786,11 @@ sub checkInput {
 		my @coresets = (`ls $hmmpath`);
 		chomp @coresets;
 		if (scalar(@coresets > 0)){
-			print "\n${bold}THE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:${norm}\n\n";
+			print "\nTHE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:\n\n";
 			for (my $i = 0; $i < @coresets; $i++){
 				my @available = qw();
 				my @unavailable = qw();
-				print "\n${bold}$coresets[$i]${norm}\n\n";
+				print "\n$coresets[$i]\n\n";
 				my @refspec = `head -n 20 $hmmpath/$coresets[$i]/$coresets[$i].fa |$grepprog '>' |cut -d '|' -f 2 |sort |uniq`;
 				chomp @refspec;
 				for (my $j = 0; $j < @refspec; $j++){
@@ -807,7 +810,7 @@ sub checkInput {
 			}
 		}
 		else {
-			print "\n${bold}NO CORE ORTHOLOG SETS ARE AVAILABLE! CHECK $hmmpath!${norm}\n\n";
+			print "\nNO CORE ORTHOLOG SETS ARE AVAILABLE! CHECK $hmmpath!\n\n";
 		}
 		print "\n\n";
 		exit;
@@ -873,12 +876,17 @@ sub checkInput {
 
 	}
 	$dbfile =~ s/.*\///;
-	$dbfile_short = $dbfile;
-	$dbfile_short =~ s/\..*//;
+	# $dbfile_short = $dbfile;
+	# $dbfile_short =~ s/\..*//;
+	my @dbfileTMP = split(/\./, $dbfile); pop @dbfileTMP;
+	$dbfile_short = join(".", @dbfileTMP);
 	if ($central) {
 		$dboutpath = $dbpath;
 		# print "setting dboutpath to $dboutpath";
 	}
+
+	# print "HERERERERERERERERER $dbfile #################\n";
+	# print "THENNNNNNNNNNNNNNNN $dbfile_short #################\n";
 	##
 	## 0) Check for presence of the file with the sequences that should be hamstered
 	if (-e "$dbpath/$dbfile") {
@@ -886,7 +894,7 @@ sub checkInput {
 	}
 	else {
 		#the provided infile does not exist:
-		push @log, "${bold}FATAL:${norm} The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n";
+		push @log, "FATAL: The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n";
 		$check = 0;
 		return ($check, @log);
 	}
@@ -952,7 +960,7 @@ sub checkInput {
 			push @log, "Translated file already exists, using this one";
 		}
 		if (! -e "$dboutpath/$dbfile") {
-			push @log, "${bold}FATAL:${norm} The translation of $dbfile_base failed. Check the script translate.pl";
+			push @log, "FATAL: The translation of $dbfile_base failed. Check the script translate.pl";
 			print "failed\n";
 			$check = 0;
 		}
@@ -965,7 +973,7 @@ sub checkInput {
 	push @log, "\nCHECKING FOR PROGRAMS\n";
 	printOUT("checking for the blast program:\t");
 	if (`which $blast_prog` =~ / no /) {
-		push @log, "${bold}FATAL:${norm} could not execute $blast_prog. Please check if this program is installed and executable";
+		push @log, "FATAL: could not execute $blast_prog. Please check if this program is installed and executable";
 		print "failed\n";
 		$check = 0;
 	}
@@ -979,12 +987,12 @@ sub checkInput {
 	printOUT("checking for hmmsearch:\t");
 	my $hmmcheck = `$prog -h |$grepprog -c 'HMMER 3'`;
 	if (! `$prog -h`) {
-		push @log, "${bold}FATAL:${norm} could not execute $prog. Please check if this program is installed and executable";
+		push @log, "FATAL: could not execute $prog. Please check if this program is installed and executable";
 		print "failed: $prog is not installed or not executable\n";
 		$check = 0;
 	}
 	elsif ($hmmcheck != 1) {
-		push @log, "${bold}FATAL:${norm} It seems that $prog is not from the HMMER 3 package. Please check!";
+		push @log, "FATAL: It seems that $prog is not from the HMMER 3 package. Please check!";
 		print "failed: $prog is not from the HMMER 3 package\n";
 		$check = 0;
 	}
@@ -996,14 +1004,14 @@ sub checkInput {
 	if ($check_genewise) {
 		printOUT("checking for genewise:\t");
 		if (! `genewise -help`) {
-			push @log, "${bold}FATAL:${norm} Could not execute genewise. Please check if this program is installed and executable";
+			push @log, "FATAL: Could not execute genewise. Please check if this program is installed and executable";
 			print "failed: genewise is not executable\n";
 			$check = 0;
 		}
 		else {
 			my $gwcheck = `echo \$WISECONFIGDIR`;
 			if (length($gwcheck) < 1) {
-				push @log, "${bold}FATAL:${norm} The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise.
+				push @log, "FATAL: The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise.
 				Please consult the installation manual for genewise and set this variable";
 				print "failed: the environmental variable WISECONFIGDIR has not been set.\n";
 				$check = 0;
@@ -1014,14 +1022,14 @@ sub checkInput {
 		}
 	}
 	else {
-		push @log, "${bold}GENEWISE-CHECK skipped:${norm} The hamstr-script has been configured with the option --protein_only. To override this setting set reconfigure the script or set the variable $check_genewise to 1";
+		push @log, "GENEWISE-CHECK skipped: The hamstr-script has been configured with the option --protein_only. To override this setting set reconfigure the script or set the variable $check_genewise to 1";
 	}
 	## 4) Check for presence of the directory structure
 
 	push @log, "\nCHECKING FOR HMMs\n";
 	printOUT("checking for presence of the hmm files:\t");
 	if ( ! defined $hmmset or ! -e "$hmmpath/$hmmset") {
-		push @log, "${bold}FATAL:${norm} You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets.";
+		push @log, "FATAL: You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets.";
 		print "failed\n";
 		$check = 0;
 	}
@@ -1033,7 +1041,7 @@ sub checkInput {
 
 		## 4b) check for the presence of the hmm-files and the fasta-file
 		if (!(-e "$hmm_dir")) {
-			push @log, "${bold}FATAL:${norm} Could not find $hmm_dir";
+			push @log, "FATAL: Could not find $hmm_dir";
 			print "failed\n";
 			$check = 0;
 		} else {
@@ -1043,7 +1051,7 @@ sub checkInput {
 				### check for the presence of all hmms
 				for (my $k = 0; $k < @hmms; $k++) {
 					if (! -e "$hmm_dir/$hmms[$k]") {
-						push @log, "${bold}FATAL:${norm} $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]";
+						push @log, "FATAL: $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]";
 						$check = 0;
 						last;
 					} else {
@@ -1073,7 +1081,7 @@ sub checkInput {
 		}
 	}
 	else {
-		push @log, "${bold}FATAL:${norm} Please provide path and name of fasta file containing the core-ortholog sequences";
+		push @log, "FATAL: Please provide path and name of fasta file containing the core-ortholog sequences";
 		$check = 0;
 		print "failed\n";
 	}
@@ -1086,7 +1094,7 @@ sub checkInput {
 		$taxon_check = 2;
 	}
 	else {
-		push @log, "${bold}FATAL:${norm} No taxon_file found. Please provide a global taxon name using the option -taxon";
+		push @log, "FATAL: No taxon_file found. Please provide a global taxon name using the option -taxon";
 		print "failed\n";
 		$check = 0;
 	}
@@ -1094,7 +1102,7 @@ sub checkInput {
 	push @log, "\nCHECKING FOR REFERENCE TAXON\n";
 	printOUT("checking for reference species and blast-dbs:\t");
 	if (!(defined $refspec_string) and (! defined $strict and ! defined $relaxed)) {
-		push @log, "${bold}FATAL:${norm} Please provide a reference species for the reblast!";
+		push @log, "FATAL: Please provide a reference species for the reblast!";
 		print "failed\n";
 		$check = 0;
 	}
@@ -1146,7 +1154,7 @@ sub checkInput {
 			printOUT("succeeded\n");
 		}
 		else {
-			push @log, "${bold}FATAL:${norm} please edit the blastpath. Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist.";
+			push @log, "FATAL: please edit the blastpath. Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist.";
 			print "$blastpathtmp failed\n";
 			$check = 0;
 		}
@@ -1174,7 +1182,7 @@ sub checkInput {
 			push  @log, "\tinfile ready";
 		} else {
 			#the provided reference fasta file does not exist or link to file does not exist:
-			push @log, "${bold}FATAL:${norm} FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n";
+			push @log, "FATAL: FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n";
 			$check = 0;
 			return ($check, @log);
 		}
@@ -1241,7 +1249,7 @@ sub checkInput {
 	printOUT("checking for low complexity filter setting:\t");
 	$filter =~ tr/ft/FT/;
 	if ($filter ne 'T' and $filter ne 'F') {
-		push @log, "${bold}FATAL:${norm} Filter is set to $filter. Please set the low complexity filter either to F or T.";
+		push @log, "FATAL: Filter is set to $filter. Please set the low complexity filter either to F or T.";
 		print "low complexity filter check failed\n";
 		$check = 0;
 	}
@@ -1283,12 +1291,10 @@ sub checkInput {
 			`rm -rf "$fa_dir_neu"`;
 			`mkdir "$fa_dir_neu"`;
 		}
-		if (!(-d "$tmpdir")) {
-			`mkdir "$tmpdir"`;
-		}
-		elsif (-d "$tmpdir" and $cleartmp) {
+		mkdir "$tmpdir" unless -d "$tmpdir";
+		if (-d "$tmpdir" and $cleartmp) {
 			`rm -rf "$tmpdir"`;
-			`mkdir "$tmpdir"`;
+			mkdir "$tmpdir" unless -d "$tmpdir";
 		}
 	}
 	## 14) determin whether or not the -representative flag has been set
@@ -1401,23 +1407,23 @@ sub check4reciprocity {
 			my $suc = 0; # keeps track of success for a single taxon
 			if ($checkCoRef == 0) {
 				## the user does not want to check further in case that id of best blast hit and of reference species differ
-				printOUT("core_orthologs: ", join "\t", @original_ids , "\n");
+				printOUT("core_orthologs: @original_ids\n");
 				## now loop through the best hits with the same score and check whether
 				## among these I find the same seq as in $original
 				my $i = 0;
 				while ($suc == 0 and $i <@$hits) {
-					printOUT("blast-hit: $hits->[$i]->{name}");
+					printOUT("blast-hit: $hits->[$i]->{name}\n");
 					## now loop through all the refspec-sequences in the hmm file; this is the case when co-orthologs have been determine in the core-ortholog
 					my $j = 0;
 					while ($suc == 0 and $j < @original_ids) {
 						if ($original_ids[$j] eq $hits->[$i]->{name}) {
-							printOUT("\thitting\n");
+							printOUT("hitting $original_ids[$j]\n");
 							$refspec_final->[$k]->{hit} = $j;
 							$suc = 1;
 							$relaxed_suc = 1;
 						}
 						else {
-							printOUT("\nnot hitting $original_ids[$j]\n");
+							printOUT("not hitting $original_ids[$j]\n");
 							$j ++;
 						}
 						if ($suc == 1) {
@@ -1468,7 +1474,7 @@ sub check4reciprocity {
 					}
 					## print distances (debug mode)
 					if ($debug){
-						my $distDebugFile = $path . "/output/" . $taxon_global . ".debug.dist";
+						my $distDebugFile = $outpath . "/" . $taxon_global . ".debug.dist"; #$path . "/output/" . $taxon_global . ".debug.dist";
 						unless (-e $distDebugFile){
 							open (my $DISTDEBUG, ">>$distDebugFile") or die "Error, could not create file: ". "$distDebugFile";
 							print $DISTDEBUG "hmmset\trefid\tbestid\tqueryid\tqhdist\trhdist\n";
@@ -2024,9 +2030,8 @@ sub determineRefspecFinal {
 	my $ac = 0;
 	for (my $i = 0; $i < @refspec; $i++) {
 		$fafile =~ s/\|/\\\|/g;
-		@original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile |$sedprog -e "s/.*$refspec[$i]\|//"`;
+		@original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile | grep -v "^\-\-\$" |$sedprog -e "s/.*$refspec[$i]\|//"`;
 		chomp @original;
-
 		if (@original > 0) {
 			$refspec_final->[$ac]->{refspec} = $refspec[$i];
 			$refspec_final->[$ac]->{searchdb} = "$blastpath/$refspec[$i]/$refspec[$i]" . $blastapp;
diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl
index 61cae86..7e8a248 100755
--- a/fdog/bin/oneSeq.pl
+++ b/fdog/bin/oneSeq.pl
@@ -121,9 +121,15 @@
 ## Modified 22. Sep 2020 v2.2.1 (Vinh)	- make sure that seed sequence always at the beginning of extended.fa output
 ## Modified 23. Sep 2020 v2.2.3 (Vinh)	- use full taxonomy name instead of abbr taxon name for LOG
 ## Modified 01. Dec 2020 v2.2.4 (Vinh)	- fixed bug while creating final extended.fa (and replaced grep and sed by bioperl)
+## Modified 16. Feb 2021 v2.2.5 (Vinh)	- core compilation works with fasoff
+## Modified 18. Feb 2021 v2.2.6 (Vinh)	- fixed searchTaxa and coreTaxa options
+## Modified 19. March 2021 v2.2.7 (Vinh)	- check for long sequence ID
+## Modified 24. March 2021 v2.2.8 (Vinh)	- skip fa.mapping while checking genome_dir
+## Modified 29. March 2021 v2.2.9 (Vinh)	- check for zero $maxAlnScore
+##                                        - solved problem with long input path for fasta36 tools
 
 ############ General settings
-my $version = 'oneSeq v.2.2.4';
+my $version = 'oneSeq v.2.2.9';
 ##### configure for checking if the setup.sh script already run
 my $configure = 0;
 if ($configure == 0){
@@ -133,10 +139,10 @@
 my $hostname = `hostname`;
 chomp $hostname;
 #############
-my $termios = new POSIX::Termios; $termios->getattr;
-my $ospeed = $termios->getospeed;
-my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed };
-my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/;
+# my $termios = new POSIX::Termios; $termios->getattr;
+# my $ospeed = $termios->getospeed;
+# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed };
+# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/;
 #### Paths
 my $path = abs_path(dirname(__FILE__));
 $path =~ s/\/bin//;
@@ -166,7 +172,7 @@
 my $blast_prog = 'blastp';
 my $outputfmt = 'blastxml';
 my $eval_blast_query = 0.0001;
-my $filter = 'T';
+my $filter = 'F'; # default for blastp
 my $annotation_prog = "annoFAS";
 my $fas_prog = "calcFAS";
 my $fdogFAS_prog = "fdogFAS";
@@ -197,6 +203,7 @@
 my $idx_dir = "$path/taxonomy/";
 my $dataDir = $path . '/data';
 my $weightPath = "$path/weight_dir/";
+my $assembly_dir = "$path/assembly_dir/";
 
 my @defaultRanks = (
 	'superkingdom', 'kingdom',
@@ -300,6 +307,15 @@
 my %hashTree;
 my $aln = 'muscle';
 my $searchTaxa;
+#variables for fdog_goes_assembly
+my $assembly;
+my $augustusRefSpec;
+my $avIntron;
+my $lengthExtension;
+my $assemblyPath;
+my $searchTool = 'blast';
+my $matrix = 'blosum62';
+my $dataPath = '';
 ################# Command line options
 GetOptions (
 	"h"                 => \$help,
@@ -361,7 +377,15 @@
 	"distDeviation=s"	=> \$distDeviation,
 	"aligner=s"	=> \$aln,
 	"hyperthread" => \$hyperthread,
-	"searchTaxa=s" => \$searchTaxa
+	"searchTaxa=s" => \$searchTaxa,
+	"assembly" => \$assembly,
+	"assemblypath=s" => \$assemblyPath,
+	"augustusRefSpec=s" => \$augustusRefSpec,
+	"avIntron=s" => \$avIntron,
+	"lengthExtension=s" => \$lengthExtension,
+	"searchTool=s" => \$searchTool,
+	"scoringmatrix=s" => \$matrix,
+	"dataPath=s" => \$dataPath
 );
 
 $outputPath = abs_path($outputPath);
@@ -373,6 +397,8 @@
 $weightPath = abs_path($weightPath)."/";
 $genome_dir = abs_path($genome_dir)."/";
 $taxaPath = $genome_dir;
+$dataPath = abs_path($dataPath)."/";
+$assembly_dir = abs_path($assemblyPath)."/";
 
 ############# do initial check
 if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) {
@@ -381,7 +407,7 @@
 	initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff);
 	print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n";
 
-	if (!defined $coreex) {
+	if (!defined $coreex && !defined $assembly) {
 		if (!grep(/$minDist/, @defaultRanks)) {
 			die "ERROR: minDist $minDist invalid!\n";
 		}
@@ -464,7 +490,7 @@
 
 # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction)
 # get annotations for seed sequence if fas support is on
-if ($fas_support){
+if ($fas_support && !$assembly){
 	if (!$weightPath) {
 		createWeightFolder();
 	}
@@ -473,7 +499,7 @@
 
 my $coreStTime = gettime(); #time;
 #core-ortholog search
-if (!$coreex) {
+if (!$coreex && !$assembly) {
 	print "\nCore compiling...\n";
 	$coremode = 1;
 	$taxaPath = $blastPath;
@@ -562,11 +588,14 @@
 			}
 		}
 		printDebug("The maximum alignmentscore is: $maxAlnScore");
+		if ($maxAlnScore == 0) {
+			die("Maximum alignment score is Zero! Something went wrong with fasta36 functions!\n")
+		}
 		clearTmpFiles();
 
 		my $addedTaxon = getBestOrtholog();
 		my $addedTaxonName = getTaxonName($addedTaxon);
-		print "Added TAXON: $addedTaxon\_$addedTaxonName\n";
+		print "Added TAXON: $addedTaxon\t$addedTaxonName\n";
 		#if a new core ortholog was found
 		if($addedTaxon ne "") {
 			$hamstrSpecies = $hamstrSpecies . "," . $addedTaxon;
@@ -608,12 +637,17 @@
 	my $final_eval_blast = $eval_blast*$eval_relaxfac;
 	my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac;
 
-	$taxaPath = $genome_dir;
+	if (!$assembly){
+		$taxaPath = $genome_dir;
+	}
+	else{
+		$taxaPath = $assembly_dir;
+	}
 	my @searchTaxa;
-	unless($groupNode) {
-		@searchTaxa = keys %taxa;
-	} else {
-		unless ($searchTaxa) {
+	unless ($searchTaxa) {
+		unless($groupNode) {
+			@searchTaxa = keys %taxa;
+		} else {
 			# %taxa = getTaxa();
 			# print "GET TAXA TIME: ", roundtime(gettime() - $startTmp),"\n";
 			my $tree = getTree();
@@ -629,11 +663,11 @@
 			foreach (get_leaves($tree)) {
 				push(@searchTaxa, @{$_->name('supplied')}[0]);
 			}
-		} else {
-			open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n";
-			@searchTaxa = <SEARCH>;
-			close (SEARCH);
 		}
+	} else {
+		open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n";
+		@searchTaxa = <SEARCH>;
+		close (SEARCH);
 	}
 	# print "PREPARE TIME: ", roundtime(gettime() - $startTmp),"\n";
 
@@ -645,15 +679,82 @@
 	foreach (sort @searchTaxa) {
 		chomp(my $searchTaxon = $_);
 		my $pid = $pm->start and next;
+		if ($coreex) {
+			$db = Bio::DB::Taxonomy->new(-source    => 'flatfile',
+				-nodesfile => $idx_dir . 'nodes.dmp',
+				-namesfile => $idx_dir . 'names.dmp',
+				-directory => $idx_dir);
+			$db_bkp = $db;
+		}
 		my $searchTaxonName = getTaxonName($searchTaxon);
 		if (defined($searchTaxonName)) {
 			unless ($silent) {
 				print $searchTaxon, "\t", $searchTaxonName, "\n";
 			} else {
-				print $searchTaxonName, "\n";
+				unless ($searchTaxonName eq "Unk") {
+					print $searchTaxonName, "\n";
+				} else {
+					print $searchTaxon, "\n";
+				}
+			}
+		}
+		if ($assembly){
+			$eval_blast = sprintf("%f", $eval_blast);
+			if ($seqFile ne "") {
+				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent");
+
+				if (defined $assemblyPath){
+					push(@assembly_cmd, "--assemblyPath $assemblyPath")
+				}
+				if (defined $avIntron){
+					push(@assembly_cmd, "--avIntron $avIntron ");
+				}
+				if (defined $lengthExtension){
+					push(@assembly_cmd, "--lengthExtension $lengthExtension ");
+				}
+				if (!$autoclean){
+					push(@assembly_cmd, "--tmp ");
+				}
+				if ($outputPath){
+					push(@assembly_cmd, "--out $outputPath ");
+				}
+				if (defined $strict){
+					push(@assembly_cmd, "--strict");
+				}
+				if ($eval_blast){
+					push(@assembly_cmd, "--evalBlast $eval_blast ");
+				}
+				if ($aln){ # guard on the aligner that is actually passed; --searchTool has its own check further down
+					push(@assembly_cmd, "--msaTool $aln ");
+				}
+				if (defined $checkcoorthologsref){
+					push(@assembly_cmd, "--checkCoorthologsRef");
+				}
+				if ($searchTool){
+					push(@assembly_cmd, "--searchTool $searchTool");
+				}
+				if ($matrix){
+					push(@assembly_cmd, "--scoringmatrix $matrix");
+				}
+				if ($coreOrthologsPath){
+					push(@assembly_cmd, "--coregroupPath $coreOrthologsPath");
+				}
+				if ($fasoff){
+					push(@assembly_cmd, "--fasoff");
+				}
+				if ($searchTaxon){
+					push(@assembly_cmd, "--searchTaxon $searchTaxon");
+				}
+				if ($filter){
+					push(@assembly_cmd, "--filter $filter");
+				}
+				printDebug(@assembly_cmd);
+				system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n";
 			}
 		}
+		else{
 		runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln);
+		}
 		$pm->finish;
 	}
 	$pm->wait_all_children;
@@ -661,8 +762,8 @@
 push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!";
 print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n";
 
-## Evaluation of all orthologs that are predicted by the final run
-if(!$coreOnly){
+
+if(!$coreOnly && !$assembly){
 	my $fasStTime = gettime();
 	my $processID = $$;
 
@@ -671,10 +772,10 @@
 		die "ERROR: Could not find $finalOutput\n";
 	}
 	# check and add seed to final extended.fa if needed
-	addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # BLABLABLABLA
+	addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
 
 	# calculate FAS scores for final extended.fa
-	if ($fas_support) {
+	if ($fas_support && !$assembly) {
 		print "Starting the feature architecture similarity score computation...\n";
 		my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu";
 		unless ($countercheck) {
@@ -687,12 +788,21 @@
 	}
 	push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
 	print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
+
 	if($autoclean){
 		print "Cleaning up...\n";
 		runAutoCleanUp($processID);
 	}
 }
 
+if ($assembly){
+	my $file_assembly_out;
+	$file_assembly_out = $outputPath . '/' . $seqName;
+	my $cmd_merge;
+	$cmd_merge = "fdog.mergeAssembly --in  $outputPath --out  $file_assembly_out --cleanup";
+	printDebug($cmd_merge);
+	system($cmd_merge);
+}
 ## Delete tmp folder
 unless ($debug) {
 	my $delTmp = "rm -rf $tmpdir";
@@ -721,8 +831,12 @@ sub clearTmpFiles {
 	}
 
 	#clear all alignment files
-	my @files = glob("*.scorefile");
-	foreach my $file (@files) {
+	my @scorefiles = glob("*.scorefile");
+	foreach my $file (@scorefiles) {
+		unlink($file);
+	}
+	my @fastaInfiles = glob("*_fasta36.fa");
+	foreach my $file (@fastaInfiles) {
 		unlink($file);
 	}
 }
@@ -761,21 +875,19 @@ sub getCandicontent{
 sub getCumulativeAlnScores{
 	chdir($coreOrthologsPath . $seqName);
 	my $candidatesFile = $outputFa . ".extended";
-	my $scorefile = $$ . ".scorefile";
+	my $fileId = $$;
+	my $scorefile = $fileId . ".scorefile";
+	my $fasta36file1 = $fileId . ".1_fasta36.fa";
+	my $fasta36file2 = $fileId . ".2_fasta36.fa";
 	my %scores;
+
 	########################
 	## step: 1
-	## setup
-	## set alignment command (glocal, local, or global)
-	#local      local:local    ssearch36   Smith-Waterman
-	#glocal     global:local   glsearch36  Needleman-Wunsch
-	#global     global:global  ggsearch36  Needleman-Wunsch
-	my $loclocCommand = "$localaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile;
-	my $globlocCommand = "$glocalaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile;
-	my $globglobCommand = "$globalaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile;
+	## set alignment parameters for fasta36
+	my $fasta36cmd = $fasta36file1 . "\" \"" . $fasta36file2 . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile;
+
 	########################
 	## step: 2
-	## setup
 	## candidates to hash
 	## %candicontent keeps info about all candidates (header and sequence)
 	my %candicontent = getCandicontent();
@@ -784,11 +896,25 @@ sub getCumulativeAlnScores{
 	## step: 3
 	## get alignment scores
 	chdir($coreOrthologsPath . $seqName);
+	symlink($outputFa, $fasta36file1);
+	symlink($candidatesFile, $fasta36file2);
 	if ($glocal){
+		#glocal     global:local   glsearch36  Needleman-Wunsch
+		my $globlocCommand = "$glocalaligner \"" . $fasta36cmd;
+		printDebug($globlocCommand);
+		# print $globlocCommand,"\n";<>;
 		system($globlocCommand);
 	}elsif ($global){
+		#global     global:global  ggsearch36  Needleman-Wunsch
+		my $globglobCommand = "$globalaligner \"" . $fasta36cmd;
+		printDebug($globglobCommand);
+		# print $globglobCommand,"\n";<>;
 		system($globglobCommand);
 	}elsif ($local){
+		#local      local:local    ssearch36   Smith-Waterman
+		my $loclocCommand = "$localaligner \"" . $fasta36cmd;
+		printDebug($loclocCommand);
+		# print $loclocCommand,"\n";<>;
 		system($loclocCommand);
 	}
 	########################
@@ -806,49 +932,7 @@ sub getCumulativeAlnScores{
 ## Get the alinment scores for the current candidate file
 sub getAlnScores{
 	chdir($coreOrthologsPath . $seqName);
-	my $candidatesFile = $outputFa . ".extended";
-	my $scorefile = $$ . ".scorefile";
-	my %scores;
-
-	########################
-	## step: 1
-	## setup
-	## set alignment command (glocal, local, or global)
-	#local      local:local    ssearch36   Smith-Waterman
-	#glocal     global:local   glsearch36  Needleman-Wunsch
-	#global     global:global  ggsearch36  Needleman-Wunsch
-	my $loclocCommand = "$localaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile;
-	my $globlocCommand = "$glocalaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile;
-	my $globglobCommand = "$globalaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile;
-
-	########################
-	## step: 2
-	## setup
-	## candidates to hash
-	## %candicontent keeps info about all candidates (header and sequence)
-	my %candicontent = getCandicontent();
-
-	########################
-	## step: 3
-	## get alignment scores
-	chdir($coreOrthologsPath . $seqName);
-	if ($glocal){
-		system($globlocCommand);
-	}elsif ($global){
-		system($globglobCommand);
-	}elsif ($local){
-		system($loclocCommand);
-	}
-
-	########################
-	## step: 4
-	## collect alignment score
-	## keep track about min and max for each query/coreortholog vs candidate set
-	my $max = -10000000;
-	my $min = 10000000;
-
-	%scores = cumulativeAlnScore($scorefile, \%candicontent);
-
+	my %scores = getCumulativeAlnScores();
 	## Normalize Alignment scores (unity-based)
 	printDebug("Normalize alignment scores:\n");
 	foreach my $key (keys %scores){
@@ -885,8 +969,8 @@ sub getFasScore{
 	## step: 2
 	## get FAS score
 	## fas support: on/off
+	my @candidateIds = keys(%candicontent);
 	if ($fas_support){
-		my @candidateIds = keys(%candicontent);
 		my ($name,$gene_set,$gene_id,$rep_id) = split(/\|/, $candidateIds[0]);
 		unless (-e "$weightPath/$gene_set.json") {
 			print "ERROR: $weightPath/$gene_set.json not found! FAS Score will be set as zero.\n";
@@ -898,6 +982,8 @@ sub getFasScore{
 			my @fasOutTmp = split(/\t/,$fasOutTmp);
 			$fas_box{$candidateIds[0]} = $fasOutTmp[1];
 		}
+	} else {
+		$fas_box{$candidateIds[0]} = 1;
 	}
 	return %fas_box;
 }
@@ -1123,10 +1209,10 @@ sub checkOptions {
 	if ($force == 1 and $append ==1) {
 		$force = 0;
 	}
-	### check the presence of the pre-computed core set
-	if ($coreex) {
+	### check the presence of the pre-computed core set if options reuseCore or assembly is used
+	if ($coreex || $assembly) {
 		if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") {
-			print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
+			print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
 			exit;
 		}
 	}
@@ -1155,7 +1241,7 @@ sub checkOptions {
 	### end move up
 	### adding new routine to generate the input sequence if -reuseCore has been set
 	if ($coreex) {
-		my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa`;
+		my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa | grep -v "^\-\-\$"`;
 		chomp @refseq;
 		unless ($silent) {
 			print "$refseq[0]\n";
@@ -1197,7 +1283,7 @@ sub checkOptions {
 
 	### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected
 	$optbreaker = 0;
-	while(!$minCoreOrthologs and !$coreex) {
+	while(!$minCoreOrthologs and (!$coreex and !$assembly)) {
 		if ($optbreaker >= 3){
 			print "No proper number given ... exiting.\n";
 			exit;
@@ -1212,10 +1298,12 @@ sub checkOptions {
 		$filter = 'no' if $filter eq 'F';
 	}
 
-	$inputSeq = fetchSequence($seqFile, $dataDir);
+	if (!$assembly){
+		$inputSeq = fetchSequence($seqFile, $dataDir);
+	}
 
 	## the user has not provided a sequence id, however, the refspec is determined.
-	if($seqId eq '') {
+	if($seqId eq '' && !$assembly) {
 		my $besthit;
 		if (!$blast){
 			## a refspec has been determined
@@ -1230,6 +1318,9 @@ sub checkOptions {
 		$refSpec = $besthit->{species};
 		my $details = "Evalue: " . $besthit->{evalue};
 		printOut("Seq id has been determined as $seqId in $refSpec with $details", 2);
+		if(length("$seqName|$refSpec|$seqId") > 60) {
+			die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n";
+		}
 		if($seqId eq '') {
 			print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n";
 			exit;
@@ -1241,13 +1332,13 @@ sub checkOptions {
 			print "Please specify a valid file with taxa for the core orthologs search\n";
 			exit;
 		}
-		my @userTaxa = parseTaxaFile();
+		my @userTaxa = parseTaxaFile($coreTaxa);
 		my %newTaxa = ();
 		foreach (@userTaxa) {
 			$newTaxa{$_} = $taxa{$_};
 		}
 		$newTaxa{$refSpec} = $refTaxa{$refSpec};
-		%taxa = %newTaxa;
+		%refTaxa = %newTaxa;
 	}
 
 	if($group) {
@@ -1334,14 +1425,14 @@ sub checkOptions {
 		}
 	}
 
-	my $node;
-	$node = $db->get_taxon(-taxonid => $refTaxa{$refSpec});
-	$node->name('supplied', $refSpec);
-
 	#### checking for the min and max distance for the core set compilation
 	#### omit this check, if the option reuseCore has been selected (added 2019-02-04)
 	$optbreaker = 0;
-	if (!$coreex) {
+	if (!$coreex and !$assembly) {
+		my $node;
+		#print "Testing coreex assembly\n";
+		$node = $db->get_taxon(-taxonid => $refTaxa{$refSpec});
+		$node->name('supplied', $refSpec);
 		if (lc($maxDist) eq "root"){
 			$maxDist = 'no rank';
 		}
@@ -1357,9 +1448,6 @@ sub checkOptions {
 			$maxDist = parseInput($node, $in);
 			print "You selected ". $maxDist . " as maximum rank\n\n";
 		}
-	}
-	$optbreaker = 0;
-	if (!$coreex){
 		while (!$minDist or (checkRank($minDist, $node) == 0)) {
 			if ($optbreaker >= 3){
 				print "No proper minDist given ... exiting.\n";
@@ -1373,6 +1461,7 @@ sub checkOptions {
 			print "You selected " . $minDist . " as minimum rank\n\n";
 		}
 	}
+	$optbreaker = 0;
 
 	#### checking in fas options
 	if($fasoff){
@@ -1596,8 +1685,9 @@ sub getBestOrtholog {
 					## candidates alnScore is high enought, that it would be better with a fasScore of one
 					## -> evaluate
 					if ($alnScores{$candiKey} > $rankScore * (1 + $distDeviation) - 1){
+						%fas_box = getFasScore();
 						if (!$gotFasScore and $fas_support){
-							%fas_box = getFasScore();
+							# %fas_box = getFasScore();
 							$gotFasScore = 1;
 						}
 						## get rankscore
@@ -1622,8 +1712,9 @@ sub getBestOrtholog {
 				}
 				## candidate has the same distance, as the last one and could be better, with a fasScore of one
 				elsif (defined $hashTree{$newNoRankDistNode}{$key->id} and $alnScores{$candiKey} > $rankScore - 1){
+					%fas_box = getFasScore();
 					if (!$gotFasScore and $fas_support){
-						%fas_box = getFasScore();
+						# %fas_box = getFasScore();
 						$gotFasScore = 1;
 					}
 					## get rankscore
@@ -1909,7 +2000,7 @@ sub getTaxonName {
 	if (defined($taxon)) {
 		return($taxon->scientific_name);
 	} else {
-		return("Unk NCBI taxon for $taxAbbr");
+		return("Unk");
 	}
 }
 
@@ -2008,6 +2099,7 @@ sub runHamstr {
 						print EXTENDEDFA ">$tmpId[0]\|$tmpId[-3]\|$tmpId[-2]\|$tmpId[-1]\n",$resultSeq->seq,"\n";
 					}
 				}
+				# addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa);
 			} else {
 				# add seed sequence to output extended.fa if no ortholog was found in refSpec
 				if ($taxon eq $refSpec) {
@@ -2054,11 +2146,13 @@ sub addSeedSeq {
 	# get seed sequence and add it to the beginning of the fasta output
 	open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n";
 	my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta');
+	my %idTmp; # used to check which seq has already been written to output
 	while(my $seq = $seqio->next_seq) {
 		my $id = $seq->id;
 		if ($id =~ /$refSpec/) {
+			$idTmp{"$id|1"} = 1;
 			print TEMP ">$id|1\n", $seq->seq, "\n";
-			last;
+			#last;
 		}
 	}
 	# then write other sequences
@@ -2066,7 +2160,9 @@ sub addSeedSeq {
 	while(my $seq = $seqio2->next_seq) {
 		my $id = $seq->id;
 		unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) {
-			print TEMP ">$id\n", $seq->seq, "\n";
+			unless ($idTmp{$id}) {
+				print TEMP ">$id\n", $seq->seq, "\n";
+			}
 		}
 	}
 	close(TEMP);
@@ -2096,17 +2192,19 @@ sub parseInput {
 }
 ##########################
 sub parseTaxaFile {
-	open (INPUT, "<$coreTaxa") or die print "Error opening file with taxa for core orthologs search\n";
+	my $coreTaxaFile = $_[0];
+	open (INPUT, "<$coreTaxaFile") or die print "Error opening file with taxa for core orthologs search\n";
 	my @userTaxa;
 	while(<INPUT>) {
 		my $line = $_;
 		chomp($line);
-		if(!$taxa{$line}) {
-			print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n";
-			exit;
-		}
-		else {
-			push(@userTaxa, $line);
+		if (length($line) > 0) {
+			if(!$taxa{$line}) {
+				print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n";
+				exit;
+			} else {
+				push(@userTaxa, $line);
+			}
 		}
 	}
 	close INPUT;
@@ -2592,7 +2690,7 @@ sub initialCheck {
 		}
 	}
 	# check weight_dir
-	if ($fasoff != 1) {
+	if ($fasoff != 1 && !$assembly) {
 		my %seen;
 		my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir);
 		chomp(my $allAnno = `ls $weightDir | $sedprog \'s/\\.json//\'`);
@@ -2607,7 +2705,7 @@ sub initialCheck {
 
 sub getGenomeFile {
 	my ($folder, $filename) = @_;
-	chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.tmp\"`);
+	chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.mapping\\|\\.tmp\"`);
 	my $out = $faFile;
 	chomp(my $link = `$readlinkprog -f $faFile`);
 	if ($link ne "") {
@@ -2641,23 +2739,23 @@ sub checkValidFolderName {
 ###########################
 sub helpMessage {
 	my $helpmessage = "
-${bold}YOU ARE RUNNING $version on $hostname$norm
+YOU ARE RUNNING $version on $hostname
 
 This program is freely distributed under a GPL.
 Copyright (c) GRL limited: portions of the code are from separate copyrights
 
-\n${bold}USAGE:${norm} oneSeq.pl -seqFile=<> -seqId=<>  -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS]
+\nUSAGE: oneSeq.pl -seqFile=<> -seqId=<>  -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS]
 
-${bold}OPTIONS:$norm
+OPTIONS:
 
-${bold}GENERAL$norm
+GENERAL
 
 -h
 	Invoke this help method
 -version
 	Print the program version
 
-${bold}REQUIRED$norm
+REQUIRED
 
 -seqFile=<>
 	Specifies the file containing the seed sequence (protein only) in fasta format.
@@ -2677,7 +2775,7 @@ sub helpMessage {
 -coreOrth=<>
 	Specify the number of orthologs added to the core set.
 
-${bold}USING NON-DEFAULT PATHS$norm
+USING NON-DEFAULT PATHS
 
 -outpath=<>
 	Specifies the path for the output directory. Default is $outputPath;
@@ -2690,7 +2788,7 @@ sub helpMessage {
 -weightpath=<>
 	Specifies the path for the pre-calculated feature annotion directory. Default is $weightPath;
 
-${bold}ADDITIONAL OPTIONS$norm
+ADDITIONAL OPTIONS
 
 -append
 	Set this flag to append the output to existing output files
@@ -2777,7 +2875,7 @@ sub helpMessage {
 	Set the alignment strategy during core ortholog compilation to glocal.
 -searchTaxa
 	Input file containing list of search taxa.
-${bold}SPECIFYING FAS SUPPORT OPTIONS$norm
+SPECIFYING FAS SUPPORT OPTIONS
 
 -fasoff
 	Turn OFF FAS support. Default is ON.
@@ -2790,7 +2888,7 @@ sub helpMessage {
 -countercheck
 	Set this flag to counter-check your final profile. The FAS score will be computed in two ways (seed vs. hit and hit vs. seed).
 
-${bold}SPECIFYING EXTENT OF OUTPUT TO SCREEN$norm
+SPECIFYING EXTENT OF OUTPUT TO SCREEN
 
 -debug
 	Set this flag to obtain more detailed information about the programs actions
diff --git a/fdog/checkData.py b/fdog/checkData.py
index 59256bc..84310ac 100644
--- a/fdog/checkData.py
+++ b/fdog/checkData.py
@@ -70,6 +70,12 @@ def checkValidFasta(file):
         fasta = SeqIO.parse(f, 'fasta')
         if not any(fasta):
             return('notFasta')
+        else:
+            # check for long header
+            inSeq = SeqIO.to_dict((SeqIO.parse(open(file), 'fasta')))
+            for id in inSeq:
+                if len(id) > 30:
+                    return('longHeader')
         # check space or tab
         if any(s in f.read() for s in spaceChr):
             return('space')
@@ -90,6 +96,7 @@ def checkValidSeqs(faFile):
     faSeq = SeqIO.parse(open(faFile),'fasta')
     for fa in faSeq:
         id, seq = fa.description, str(fa.seq)
+        c = ''
         if any(e in id for e in spaceChr):
             sys.exit('*** ERROR: Invalid character found in \">%s\" in %s' % (id, faFile))
         if any(c for c in seq if not c.isalpha()):
@@ -131,6 +138,8 @@ def checkDataFolder(checkDir, replace, delete, concat):
                             checkFaFile = checkValidFasta(faFile)
                             if checkFaFile == 'notFasta':
                                 sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile)
+                            elif checkFaFile == 'longHeader':
+                                sys.exit('*** ERROR: %s contains long headers!' % faFile)
                             elif checkFaFile == 'space':
                                 sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile)
                             elif checkFaFile == 'multiLine':
@@ -184,7 +193,7 @@ def checkMissingNcbiID(namesDmp, taxaList):
     return(missingTaxa.keys(), dupTaxa)
 
 def main():
-    version = '0.0.2'
+    version = '0.0.3'
     parser = argparse.ArgumentParser(description='You are running fdog.checkData version ' + str(version) + '.')
     parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='')
     parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. fdog_dataPath/blast_dir)', action='store', default='')
diff --git a/fdog/data/.DS_Store b/fdog/data/.DS_Store
index fde072a6aebc6f6618808f5bbd3cd63c202098d1..bf1ded6ef3f07fae44d0ee29b918a7b6e62c579b 100644
GIT binary patch
delta 166
zcmZp1XfcprU|?W$DortDU=RQ@Ie-{Mvv5r;6q~50$jGrVU^g=($7CLXsf?#4TM0L4
zNmf@I8kp!P7#bVa>L^qj8i6?GCMLDDoE+k+hPIvwxs_GbHMMoKCQlHOXPiAbTTpxR
zRiQq{iKUMgvvY6=G6T&A0s(Fy;R>>9W8rt^$^0^&Ad4B8AdUdJi(zv-&m3j|>H;Ig

delta 201
zcmZoMXmOBWU|?W$DortDU;r^WfEYvza8E20o2aMA$h|ROH}hr%jz7$c**Q2SHn1>q
zPv&8nI{7#2*~$5=8m^oSNenp*i44UIB@FQlDGZqmMV>kN$w@i+Ng!i@hW`Z8djG)y
z$YNl?qGR$0)*IYN;+q>-mojcn=2^|mCBY5U;tF!mW<iec%#-;=JSY3}aBwg}+{&;y
Io@Wj-0Lzpw)c^nh

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
new file mode 100644
index 0000000..de9f343
--- /dev/null
+++ b/fdog/fDOGassembly.py
@@ -0,0 +1,837 @@
+############################ imports ###########################################
+import os
+import os.path
+import sys
+from Bio import SeqIO
+from Bio.Phylo.TreeConstruction import DistanceCalculator
+from Bio import AlignIO
+import argparse
+import yaml
+import subprocess
+########################### functions ##########################################
+def load_config(config_file):  # Open `config_file` and return the object produced by yaml.safe_load.
+    with open(config_file, 'r') as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)  # A YAML parse error is only printed; the function then falls through and returns None.
+
+def starting_subprocess(cmd, mode):  # Run `cmd` through the shell; `mode` ('debug' | 'silent' | 'normal') picks what is captured. Any other mode runs nothing.
+    if mode == 'debug':
+        result = subprocess.run(cmd, shell=True)  # no stream redirected: the child's output is not captured
+    elif mode == 'silent':
+        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)  # stdout and stderr are both captured
+    elif mode == 'normal':
+        result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True)  # only stdout is captured; `result` is never returned or inspected
+
+def merge(blast_results, insert_length):
+    # Merge overlapping and contiguous candidate regions per key; returns (blast_results, total number of regions left).
+    number_regions = 0
+    insert_length = int(insert_length)
+    for key in blast_results:
+        locations = blast_results[key]
+        locations = sorted(locations, key = lambda x: int(x[3]))
+        # Each location is [start, end, evalue, qstart, qend, strand] (the layout parse_blast builds);
+        # the sort above orders the list by index 3, i.e. qstart.
+        size_list = len(locations)
+        j = 0
+        while j < size_list-1:
+            i = j + 1
+            while i < size_list:
+                if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
+                    # merge overlapping regions on the plus strand: extend the end, keep the smaller e-value
+                    locations[j][1] = max(locations[j][1], locations[i][1])
+                    locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations.pop(i)
+                    size_list -= 1
+                    i -= 1  # cancels the `i += 1` below so the element that moved into slot i is examined next
+                elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+                    # merge overlapping regions on the minus strand: extend the start, keep the smaller e-value
+                    locations[j][0] = min(locations[j][0], locations[i][0])
+                    locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations.pop(i)
+                    size_list -= 1
+                    i -= 1
+                elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
+                    # merge consecutive regions on the plus strand whose gap is at most 2 * insert_length
+                    locations[j][1] = max(locations[j][1], locations[i][1])
+                    locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations.pop(i)
+                    size_list -= 1
+                    i -=1
+                elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+                    # merge consecutive regions on the minus strand whose gap is at most 2 * insert_length
+                    locations[j][0] = min(locations[j][0], locations[i][0])
+                    locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations.pop(i)
+                    size_list -= 1
+                    i -=1
+                i += 1
+            j += 1
+
+        number_regions += len(locations)
+        blast_results[key] = locations  # the dict passed in is updated in place with the merged list
+
+    return blast_results, number_regions
+
+def parse_blast(line, blast_results, cutoff):
+    # Expected tab-separated blast line: <contig> <sstart> <send> <evalue> <qstart> <qend>
+    # Dictionary layout: {node_name: [[<start>, <end>, <evalue>, <qstart>, <qend>, <strand>], ...]}; returns (blast_results, evalue).
+    line = line.replace("\n", "")
+    line_info = line.split("\t")
+    evalue = float(line_info[3])
+    # hits with an e-value above the cut-off are ignored; the dictionary is returned unchanged
+    if evalue > cutoff:
+        return blast_results, evalue
+    # otherwise add the region to the dictionary
+    else:
+        node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5])
+        split = node_name.split("|")
+        # find out on which strand tBLASTn found the hit; for '-' hits start and end are swapped so that start < end
+        if sstart < send:
+            strand = "+"
+        else:
+            sstart = int(line_info[2])
+            send = int(line_info[1])
+            strand = "-"
+        # build a dictionary that includes every tBLASTn hit that is better than the e-value cut-off
+        if len(split) > 1:
+            node_name = split[1]  # for '|'-separated contig names only the second field is used as key
+        if node_name in blast_results:
+            list = blast_results[node_name]  # NOTE: this local name shadows the builtin `list` inside the function
+            list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand])
+            blast_results[node_name] = list
+        else:
+            blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]]
+
+    return blast_results, evalue
+
+def candidate_regions(intron_length, cutoff_evalue, tmp_path):
+    ###################### extracting candidate regions ########################
+    # Reads <tmp_path>/blast_results.out line by line; format info: http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
+    blast_file = open(tmp_path + "/blast_results.out", "r")  # NOTE: this handle is never closed explicitly in this function
+    evalue = 0
+    blast_results = {}
+    # parse the blast output one line at a time
+    while True:
+        line = blast_file.readline()
+        # an empty string means the end of the file is reached
+        if not line:
+            break
+        # add the hit to blast_results if its e-value passes cutoff_evalue
+        blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
+
+    if blast_results == {}:
+        return 0,0  # no hit passed the cut-off: callers get (0, 0) instead of (dict, count)
+    else:
+        candidate_regions, number_regions = merge(blast_results, intron_length)  # intron_length is used as merge()'s insert_length
+
+        return candidate_regions, number_regions
+
+def extract_seq(region_dic, path, tmp_path, mode):  # Run blastdbcmd once per key of region_dic against the database at `path`.
+
+    for key in region_dic:
+        # The entry named `key` is written to <tmp_path><key>.fasta; tmp_path is concatenated as-is, without adding a separator.
+        cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f"
+        starting_subprocess(cmd, mode)
+
+def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode):
+    output = open(candidatesOutFile, "w")  # all re-headered sequences are collected in this file; closed at the end of the function
+
+    for key in regions:
+        locations = regions[key]
+        counter = 0
+        for i in locations:
+            # prediction window: region start/end (i[0], i[1]) widened by length_extension on both sides
+            counter += 1
+            start = str(i[0] - length_extension)
+            end = str(i[1] + length_extension)
+            name = key + "_" + str(counter)
+            # augustus call on <tmp_path><key>.fasta; its output is redirected to <tmp_path><name>.gff
+            cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff"
+            # this call always uses 'silent' mode, independent of the `mode` argument
+            starting_subprocess(cmd, 'silent')
+            # run getAnnoFasta.pl on the gff file (presumably this produces the <name>.aa file read below -- confirm)
+            cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
+            starting_subprocess(cmd, mode)
+            # parse headers and sequences from <tmp_path><name>.aa and copy them to the output file
+            try:
+                sequence_file = open(tmp_path + name + ".aa", "r")
+                lines = sequence_file.readlines()
+                for line in lines:
+                    if line[0] == ">":
+                        id = line.replace(">", "")  # keeps the trailing newline of the header line; shadows the builtin `id`
+                        header = ">" + group + "|" + ass_name + "|" + name + "_" + id
+                        output.write(header)
+                    else:
+                        output.write(line)
+                sequence_file.close()
+            except FileNotFoundError:
+                print("No gene found in region with ID:" + name + " , continuing with next region")
+    output.close()
+
+def searching_for_db(assembly_path):
+
+    db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto']
+    check = True
+    for end in db_endings:
+        check = check and os.path.exists(assembly_path + end)
+    return check
+
+def get_distance_biopython(file, matrix):
+    aln = AlignIO.read(open(file), 'fasta')
+    calculator = DistanceCalculator(matrix)
+    dm = calculator.get_distance(aln)
+    return dm
+
+def readFasta(candidatesOutFile):
+    seq_records = SeqIO.parse(candidatesOutFile, "fasta")
+    return seq_records
+
+def getSeedInfo(path):
+    dic = {}
+    seq_records = readFasta(path)
+    for entry in seq_records:
+        species = entry.id.split("|")[1]
+        geneID = entry.id.split("|")[2]
+
+        try:
+            dic[species].append(geneID)
+        except KeyError:
+            dic[species] = [geneID]
+
+    del seq_records
+    return dic
+
+def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path):
+    ###########getting sequences and write all in one file to make msa #########
+    name_file = candidate_name + ".co"
+    output_file = tmp_path + name_file + '.fasta'
+    aln_file = tmp_path + name_file + '.aln'
+    genome_dir_path = dataPath + '/genome_dir/%s/%s.fa'%(fdog_ref_species, fdog_ref_species)
+    #print(searchTool)
+
+    out = open(output_file, "w")
+    inSeq = SeqIO.to_dict((SeqIO.parse(open(genome_dir_path), 'fasta')))
+    out.write(">" + best_hit + "\n")
+    out.write(str(inSeq[best_hit].seq) + "\n")
+    out.write(">" + ref + "\n")
+    out.write(str(inSeq[ref].seq )+ "\n")
+
+    candidates = readFasta(candidatesOutFile)
+    for record in candidates:
+        if candidate_name in record.id:
+            out.write(">" + candidate_name + "\n")
+            out.write(str(record.seq) + "\n")
+            break
+
+    out.close()
+
+    if msaTool == "muscle":
+        os.system("muscle -quiet -in " + output_file + " -out " + aln_file)
+        #print("muscle -quiet -in " + output_file + " -out " + aln_file)
+    elif msaTool == "mafft-linsi":
+        #print("mafft-linsi")
+        os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
+
+    distances = get_distance_biopython(aln_file, matrix)
+
+    distance_hit_query = distances[best_hit, candidate_name]
+    distance_ref_hit = distances[best_hit, ref]
+
+    if distance_ref_hit < distance_hit_query:
+        #accepted
+        return 1, distance_ref_hit, distance_hit_query
+
+    else:
+        #rejected
+        return 0, distance_ref_hit, distance_hit_query
+
+def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode):
+    # the backward search uses the genes predicted from augustus and makes a blastp search
+    #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa
+    seedDic = getSeedInfo(fasta_path)
+    #print(fasta_path)
+    orthologs = []
+    #print(seedDic)
+    blast_dir_path = dataPath + "/blast_dir/"
+    if strict != True:
+        seed = [fdog_ref_species]
+        try:
+            id_ref = seedDic[fdog_ref_species]
+        except KeyError:
+            print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
+            return 0, seed
+        if searchTool == "blast":
+            cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
+            starting_subprocess(cmd, mode)
+        else:
+            print("diamonds are the girls best friends")
+            ##### diamond call
+
+        alg_file = open(tmp_path + "blast_" + fdog_ref_species, "r")
+        lines = alg_file.readlines()
+        alg_file.close()
+        old_name = None
+        min = 10
+        for line in lines:
+            id, gene, evalue = (line.replace("\n", "")).split("\t")
+            gene_name = gene.split("|")[2]
+            if gene_name != old_name:
+                print("candidate:%s"%(gene_name))
+                print("blast-hit:%s"%(id))
+                min = float(evalue)
+                if id in id_ref:
+                    orthologs.append(gene)
+                    print("\thitting\n")
+                else:
+                    if checkCo == True:
+                        for i in id_ref:
+                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+                            co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
+                            if co_orthologs_result == 1:
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+                                orthologs.append(gene)
+                            elif co_orthologs_result == 0:
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+                    else:
+                        print("\tnothitting\n")
+            elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs:
+                if id in id_ref:
+                    orthologs.append(gene)
+                    print("\thitting\n")
+                else:
+                    if checkCo == True:
+                        for i in id_ref:
+                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+                            co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
+                            if co_orthologs_result == 1:
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+                                orthologs.append(gene)
+                            elif co_orthologs_result == 0:
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+                    else:
+                        print("\tnot hitting\n")
+            old_name = gene_name
+
+
+        if orthologs == []:
+            print("No hit in the backward search, ...exciting")
+            return 0, seed
+
+    else:
+        if taxa != []:
+            seed = taxa
+            try:
+                i = seed.index(fdog_ref_species)
+                seed.insert(0,seed.pop(i))
+            except ValueError:
+                seed.insert(0,fdog_ref_species)
+            #print(seed)
+            #print("with taxa list from user input")
+
+        else:
+            seed = []
+            for key in seedDic:
+                if key == fdog_ref_species:
+                    seed.insert(0,key)
+                else:
+                    seed.append(key)
+
+        orthologs = set({})
+
+        for species in seed:
+            print("backward search in species " + species + "\n")
+            orthologs_new = set({})
+            try:
+                id_ref = seedDic[species]
+            except KeyError:
+                print("The species " + species + " isn't part of the core ortholog group, ... exciting")
+                return 0, seed
+
+            cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
+            starting_subprocess(cmd, mode)
+            alg_file = open(tmp_path + "/blast_" + species, "r")
+            lines = alg_file.readlines()
+            alg_file.close()
+            old_name = None
+            min = 10
+            for line in lines:
+                id, gene_name, evalue = (line.replace("\n", "")).split("\t")
+                if gene_name != old_name:
+                    min = float(evalue)
+                    if id in id_ref:
+                        orthologs_new.add(gene_name)
+
+                elif (gene_name == old_name) and float(evalue) == min:
+                    if id in id_ref:
+                        orthologs_new.add(gene_name)
+
+            #print(species)
+            #print(orthologs_new)
+            if species == fdog_ref_species:
+                orthologs = orthologs_new
+            else:
+                orthologs = orthologs & orthologs_new
+                if orthologs == {}:
+                    print("No ortholog was found with option --strict")
+                    return 0, seed
+
+
+
+    #print(orthologs)
+    orthologs = set(orthologs)
+    return list(orthologs), seed
+
+def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
+
+    output_file = open(output, "a+")
+    if refBool == False:
+        seq_records_core = readFasta(core_fasta)
+        seq_records_core = list(seq_records_core)
+        for species in species_list:
+            for entry_core in seq_records_core:
+                if species in entry_core.id:
+                    output_file.write(">" + entry_core.id + "\n")
+                    output_file.write(str(entry_core.seq) + "\n")
+
+    if sequenceIds != 0:
+        seq_records_candidate = readFasta(candidate_fasta)
+        seq_records_candidate = list(seq_records_candidate)
+        for entry_candidate in seq_records_candidate:
+            if entry_candidate.id in sequenceIds:
+                if entry_candidate.id == sequenceIds[0]:
+                    output_file.write(">" + entry_candidate.id + "|1" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
+                else:
+                    output_file.write(">" + entry_candidate.id + "|0" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
+    output_file.close()
+    return 0
+
+def createFasInput(orthologsOutFile, mappingFile):
+    with open(orthologsOutFile, "r") as f:
+        fas_seed_id = (f.readline())[1:-1]
+        #fas_seed_id = fas_seed_id.split("|")[0]
+
+    mappingFile = open(mappingFile, "a+")
+
+    seq_records = readFasta(orthologsOutFile)
+    for seq in seq_records:
+        ncbi_id = (seq.id.split("@"))[1]
+        mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n")
+
+
+    return fas_seed_id
+
+def cleanup(tmp, tmp_path):
+    if tmp == False:
+        os.system('rm -r ' + tmp_path)
+
+def checkOptions():
+    pass
+    #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!!
+
+def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
+    if len(candidate_names) == 1:
+        return candidate_names
+
+    candidates = readFasta(candidatesFile)
+    ref = readFasta(fasta)
+
+    out = tmp_path + '/checkCoorthologs.fa'
+    f = open(out,"w")
+
+    aln_file = tmp_path + '/checkCoorthologs.aln'
+
+    for record in ref:
+        if fdog_ref_species in record.id:
+            ref_id = record.id
+            f.write(">" + record.id + "\n")
+            f.write(str(record.seq) +  "\n")
+            break
+
+    for record in candidates:
+        for name in candidate_names:
+            if name in record.id:
+                f.write(">" + name + "\n")
+                f.write(str(record.seq) + "\n")
+    f.close()
+
+    if msaTool == "muscle":
+        os.system("muscle -quiet -in " + out + " -out " + aln_file)
+    elif msaTool == "mafft-linsi":
+        os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file)
+
+    distances = get_distance_biopython(aln_file, matrix)
+
+    min_dist = 10
+    min_name = None
+
+    for name in candidate_names:
+        distance = distances[ref_id , name]
+        if distance <= min_dist:
+            min_dist = distance
+            min_name = name
+
+    checked = [min_name]
+
+    for name in candidate_names:
+        if name == min_name:
+            pass
+        elif distances[min_name , name] <= distances[min_name , ref_id]:
+            checked.append(name)
+
+    return checked
+
+def clean_fas(path, file_type):
+    file = open(path, "r")
+    lines = file.readlines()
+    file.close()
+    file = open(path,"w")
+
+    for line in lines:
+        if file_type == 'domains':
+            long_id, remain = line.split("#")
+            id = long_id.split("|")[0]
+            new_line = id + "#" + remain
+        else:
+            long_id, remain = line.split("\t", 1)
+            id = long_id.split("|")[0]
+            new_line = id + "\t" + remain
+
+        file.write(new_line)
+
+class Logger(object):
+    def __init__(self, file):
+        self.file = file
+        self.terminal = sys.stdout
+        self.log = self.file
+
+    def write(self, message):
+        self.terminal.write(message)
+        self.log.write(message)
+
+    def flush(self):
+        pass
+
+
+def main():
+
+    #################### handle user input ########################################
+
+    version = '0.0.1'
+
+    parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
+    parser.add_argument('--version', action='version', version=str(version))
+
+    required = parser.add_argument_group('Required arguments')
+    required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/',
+                            action='store', default='', required=True)
+    required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True)
+    required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True)
+
+    optional = parser.add_argument_group('Optional arguments')
+    optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int)
+    optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int)
+    optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='')
+    optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False)
+    optional.add_argument('--out', help='Output directory', action='store', default='')
+    optional.add_argument('--dataPath', help='data directory', action='store', default='')
+    optional.add_argument('--coregroupPath', help='core_ortholog directory', action='store', default='')
+    optional.add_argument('--searchTool', help='Choose between blast and diamond as alignemnt search tool(default:blast)', action='store', choices=['blast', 'diamond'], default='blast')
+    optional.add_argument('--evalBlast', help='E-value cut-off for the Blast search. (default: 0.00001)', action='store', default=0.00001, type=float)
+    optional.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False)
+    optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
+    optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False)
+    optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
+    optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='')
+    optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
+    optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
+    optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
+    optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='')
+    optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
+    optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
+
+
+    args = parser.parse_args()
+
+    # required
+    group = args.gene
+    augustus_ref_species = args.augustusRefSpec
+    fdog_ref_species = args.refSpec
+    #paths user input
+    assemblyDir = args.assemblyPath
+    dataPath = args.dataPath
+    core_path = args.coregroupPath
+    out = args.out
+    pathFile = args.pathFile
+    #I/O
+    tmp = args.tmp
+    strict = args.strict
+    checkCoorthologs = args.checkCoorthologsRef
+    filter = args.filter
+    if filter == True or filter == 'yes':
+        filter = 'yes'
+    else:
+        filter = 'no'
+    #others
+    average_intron_length = args.avIntron
+    length_extension = args.lengthExtension
+    searchTool = args.searchTool
+    evalue = args.evalBlast
+    msaTool = args.msaTool
+    matrix = args.scoringmatrix
+    taxa = args.coreTaxa
+    if taxa == '':
+        taxa =[]
+    else:
+        taxa = taxa.split(",")
+    fasoff = args.fasoff
+    searchTaxon = args.searchTaxon
+    silent = args.silent
+    debug = args.debug
+
+    if debug == True and silent == True:
+        print("It's not possible to use booth modes, please restart and use --debug or --silent")
+        return 1
+    else:
+        if debug == True:
+            mode = 'debug'
+        elif silent == True:
+            mode = 'silent'
+        else:
+            mode = 'normal'
+
+    #checking paths
+    if dataPath == '':
+        fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','')
+        configFile = fdogPath + '/bin/pathconfig.txt'
+        if not os.path.exists(configFile):
+            sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog) or give a dataPath')
+        if pathFile == '':
+            with open(configFile) as f:
+                dataPath = f.readline().strip()
+        else:
+            cfg = load_config(pathFile)
+            try:
+                dataPath = cfg['dataPath']
+            except:
+                dataPath = 'config'
+    if core_path == '':
+        core_path = out + '/core_orthologs/'
+    else:
+        if not core_path.endswith('/'):
+            core_path = core_path + '/'
+
+    if assemblyDir == '':
+        assemblyDir = dataPath + '/assembly_dir/'
+    if out == '':
+        #print('test out \n')
+        out = os.getcwd()
+        os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
+        out = out + '/' + group + '/'
+    else:
+        if out[-1] != "/":
+            out = out + "/"
+
+
+    try:
+        f = open(out + "/fdog.log", "a+")
+    except FileNotFoundError:
+        f = open(out + "/fdog.log", "w")
+
+    ################## How to handle std output and std error ##################
+
+    if mode == 'silent':
+        sys.stderr = f
+        sys.stdout = f
+    else:
+        sys.stdout = Logger(f)
+
+    # user input has to be checked here before fDOGassembly continues
+    assembly_names = os.listdir(assemblyDir)
+
+    ########################## some variables ##################################
+
+    refBool = False # checks if sequences of reference species were already part of the extended.fa file
+
+    ########### paths ###########
+
+    msa_path = core_path + "/" + group +"/"+ group + ".aln"
+    hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
+    fasta_path = core_path + "/" + group +"/"+ group + ".fa"
+    consensus_path = out + "/tmp/" + group + ".con"
+    profile_path = out + "/tmp/" + group + ".prfl"
+
+    ###################### create tmp folder ###################################
+
+    cmd = 'mkdir ' + out + '/tmp'
+    starting_subprocess(cmd, 'silent')
+
+    ######################## consensus sequence ################################
+
+    #make a majority-rule consensus sequence with the tool hmmemit from hmmer
+    print("Building a consensus sequence for gene " + group + " \n")
+    cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
+    starting_subprocess(cmd, mode)
+    print("consensus sequence is finished\n")
+
+    ######################## block profile #####################################
+
+    print("Building a block profile for gene " + group + " \n")
+    cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
+    starting_subprocess(cmd, 'silent')
+
+    if int(os.path.getsize(profile_path)) > 0:
+        print("block profile is finished \n")
+    else:
+        print("Building block profiles failed. Using prepareAlign to convert alignment\n")
+        new_path = core_path + group +"/"+ group + "_new.aln"
+        #print(cmd)
+        cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
+        starting_subprocess(cmd, mode)
+        cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
+        #print(cmd)
+        starting_subprocess(cmd, 'silent')
+        print("block profile is finished \n")
+
+    searchBool = False
+
+    #################### fDOG assembly computation for all species #############
+    for asName in assembly_names:
+        if searchBool == True:
+            break
+        if searchTaxon != '' and searchBool == False:
+            asName = searchTaxon
+            searchBool = True
+
+        ################### path definitions ###################################
+
+        cmd = 'mkdir ' + out + '/tmp/' + asName
+        starting_subprocess(cmd, 'silent')
+        tmp_path = out + "/tmp/" + asName + "/"
+        candidatesOutFile = tmp_path + group + ".candidates.fa"
+        if searchTaxon != '':
+            orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa"
+            fasOutFile = out + "/" + group + "_" + asName
+            mappingFile = tmp_path + group + "_" + asName + ".mapping.txt"
+        else:
+            orthologsOutFile = out + "/" + group + ".extended.fa"
+            fasOutFile = out + "/" + group
+            mappingFile = out + "/tmp/" + group + ".mapping.txt"
+
+        print("Searching in species " + asName + "\n")
+        assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
+        db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
+
+    ######################## tBLASTn ###########################################
+        #checks if data base exists already
+        db_check = searching_for_db(db_path)
+        if db_check == 0:
+            print("creating a blast data base \n")
+            cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
+            starting_subprocess(cmd, mode)
+            print("database is finished \n")
+        else:
+            print('blast data base exists already, continuing...')
+
+        #makes a tBLASTn search against the new database
+        #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+        print("tBLASTn search against data base")
+        cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+        starting_subprocess(cmd, mode)
+        print("tBLASTn search is finished")
+
+    ################### search for candidate regions and extract seq ###########
+    # parse blast and filter for candiate regions
+        regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
+
+        if regions == 0:
+            #no candidat region are available, no ortholog can be found
+            print("No candidate region found")
+            if refBool == True:
+                continue
+            else:
+                taxa = [fdog_ref_species]
+                reciprocal_sequences = 0
+        else:
+            print(str(number_regions) + " candiate regions were found. Extracting sequences...")
+            extract_seq(regions, db_path, tmp_path, mode)
+
+    ############### make Augustus PPX search ###################################
+
+            print("starting augustus ppx \n")
+            augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
+            print("augustus is finished \n")
+
+    ################# backward search to filter for orthologs###################
+            if int(os.path.getsize(candidatesOutFile)) <= 0:
+                print("No genes found at candidate regions\n")
+                if searchTaxon == '' and refBool == True:
+                    continue
+                else:
+                    reciprocal_sequences = 0
+                    taxa = [fdog_ref_species]
+            else:
+                reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
+
+
+    ################## checking accepted genes for co-orthologs ################
+        if reciprocal_sequences == 0:
+            if regions != 0:
+                print("No ortholog fulfilled the reciprocity criteria")
+            if searchTaxon == '' and refBool == True:
+                continue
+            else:
+                reciprocal_sequences = 0
+        else:
+            reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+
+    ################ add sequences to extended.fa in the output folder##########
+
+        addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
+        refBool = True
+
+    ############### make Annotation with FAS ###################################
+        # if we want to search in only one Taxon
+        if searchTaxon != '' and fasoff == False:
+            print("Calculating FAS scores")
+            fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
+            # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
+            cmd = 'mkdir ' + tmp_path + 'anno_dir'
+            starting_subprocess(cmd, 'silent')
+            cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
+            starting_subprocess(cmd, 'silent')
+            clean_fas(fasOutFile + "_forward.domains", 'domains')
+            clean_fas(fasOutFile + "_reverse.domains", 'domains')
+            clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile')
+
+
+    #if we searched in more than one Taxon and no ortholog was found
+
+    if refBool == False and searchTaxon == '':
+        print("No orthologs found. Exciting ...")
+        cleanup(tmp, tmp_path)
+        return 1
+    #if we searched in more than one taxon
+    if fasoff == False and searchTaxon == '':
+        print("Calculating FAS scores")
+        tmp_path = out + '/tmp/'
+        fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
+        # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
+        cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
+        starting_subprocess(cmd, 'silent')
+        clean_fas(out + group + "_forward.domains", 'domains')
+        clean_fas(out + group + "_reverse.domains", 'domains')
+        clean_fas(out + group + ".phyloprofile", 'phyloprofile')
+    ################# remove tmp folder ########################################
+    if searchTaxon != '':
+        cleanup(tmp, tmp_path)
+    else:
+        cleanup(tmp, out + "/tmp/")
+
+    f.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/fdog/fdog_goes_assembly/.DS_Store b/fdog/fdog_goes_assembly/.DS_Store
deleted file mode 100644
index e0e9ff1be0aa35d6ef237330e7d7dd1ba746d1ec..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHK%}(1u5S~p^*rck+0SS(K;TEB|ttxRqG9d_Y0?7ytv;w<HizVZYVuvV%AioB@
z1kb?}@HjBDyFt*<YZX;9*6cSsJF{zlYwZsJh}NLv0n`CNq7s&>SiB?DPC6qc=cynH
z(?<vyK3%wgXeOE+^T+_r-4gtQHtfSLyq>=VMz9V&80{tcXOw7~QTk1`P}iS~paoEd
z9Ot|DN7ViMQPD8#xxOBvUJ_?{v-v1W<;sVJMJa2t_Nn$GIM>4<5A#9R3i~HiI|%dh
z$k~TlCogsw9mU<@X?=NHr+FBs-M&eVJ6#O<ejKMAJ#6Vg+R02mfo+hoDyygU)$!PC
zer+i4i#KVg@y6$7Lw)l$CX=dMS@Smc&Mrn*<Lk-It@%t4_=_7^ayW%MG#m++K|f7&
z`Wrpd=xG!qGr$Zm19QZHTcYI0IeI646=r}Lm=6YMeNd=`zQe+zSvs&VR{%shMjAnz
z4vCbb9QqCmi`ar9Y$~En74C{5Y&!ati}M{87Hv8RGcxw$Mi%adBFyOMS0)^UZ;?x8
zfEjqpK-pI7wEyq?{r>+piEGRNGw@$AAS&&k-NH|Dd+W@{(O#=i&rnGyuCVx$f`-|O
gF_yOCRa7JBS7adi4hxIuLE(ddp@9o#;GZ(^412?A3IG5A

diff --git a/fdog/fdog_goes_assembly/fDOGassembly.py b/fdog/fdog_goes_assembly/fDOGassembly.py
deleted file mode 100644
index ad4c362..0000000
--- a/fdog/fdog_goes_assembly/fDOGassembly.py
+++ /dev/null
@@ -1,209 +0,0 @@
-############################ imports ###########################################
-import os
-########################### functions ##########################################
-
-
-def merge_regions(blast_results, cut_off):
-    number_regions = 0
-    for key in blast_results:
-        locations = blast_results[key]
-        size_list = len(locations)
-        i = 0
-        j = 1
-        old_size = 0
-        while size_list != old_size and i < size_list:
-            old_size = size_list
-            start = locations[i][0]
-            end = locations[i][1]
-
-            #print(locations)
-            while j < size_list:
-
-                # breakup point? or we have to skip this j
-                if (i == j) and (j + 1 < size_list):
-                    j+=1
-                elif (i == j):
-                    break
-
-                if (locations[i][0] < locations[j][0]) and (locations[i][1] > locations[j][0]):
-                    # start is between start and end -> merge
-                    locations[i][1] = max(locations[j][1], locations[i][1])
-                    locations[i][2] = min(locations[j][2], locations[i][2])
-                    locations.pop(j)
-                    j -= 1
-                elif (locations[i][0] < locations[j][1]) and (locations[i][1] > locations[j][1]):
-                    #end is between start and end -> merge
-                    locations[i][0] = min(locations[j][0], locations[i][0])
-                    locations[i][2] = min(locations[j][2], locations[i][2])
-                    locations.pop(j)
-                    j -= 1
-                elif (locations[i][0] > locations[j][1]) and (locations[i][0] - locations[j][1] <= cut_off):
-                    # end is not more than cut-off distanced
-                    locations[i][0] = locations[j][0]
-                    locations[i][2] = min(locations[j][2], locations[i][2])
-                    locations.pop(j)
-                    j -= 1
-                elif (locations[i][1] < locations[j][0] and locations[j][0] - locations[i][1] <= cut_off):
-                    # start is not more than cut-off distanced
-                    locations[i][0] = locations[j][0]
-                    locations[i][2] = min(locations[j][2], locations[i][2])
-                    locations.pop(j)
-                    j -= 1
-                j += 1
-                size_list = len(locations)
-
-            i += 1
-            j = 0
-        number_regions += size_list
-
-    return blast_results, number_regions
-
-
-def parse_blast(line, blast_results):
-    # format blast line:  <contig> <start> <end> <evalue> <score>
-    #fomrat dictionary: {node_name: [(<start>,<end>)]}
-    #print(line)
-    line = line.replace("\n", "")
-    line_info = line.split("\t")
-    #print(line_info)
-    evalue = float(line_info[3])
-
-    #cut off
-    if evalue > 0.0001:
-        return blast_results, evalue
-    #add region to dictionary
-    else:
-        node_name, start, end = line_info[0], line_info[1], line_info[2]
-        if node_name in blast_results:
-            list = blast_results[node_name]
-            list.append([int(start),int(end), evalue])
-            blast_results[node_name] = list
-        else:
-            blast_results[node_name] = [[int(start),int(end), evalue]]
-
-    return blast_results, evalue
-
-
-def candidate_regions(cut_off):
-    ###################### extracting candidate regions ########################
-    # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
-    blast_file = open("tmp/blast_results.out", "r")
-
-    evalue = 0
-    blast_results = {}
-    #parsing blast output
-    while True:
-        line = blast_file.readline()
-        #end of file is reached
-        if not line:
-            break
-        #parsing blast output
-        blast_results, evalue = parse_blast(line, blast_results)
-        #evalue cut-off
-        if not evalue <= 0.00001:
-            break
-    if blast_results == {}:
-        return 1
-    else:
-        candidate_regions, number_regions = merge_regions(blast_results, cut_off)
-        #print(candidate_regions, number_regions)
-        return candidate_regions, number_regions
-
-
-def extract_seq(region_dic, path):
-    #print(region_dic)
-    for key in region_dic:
-        os.system("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f")
-
-
-def main():
-
-    ########################### handle user input ##############################
-
-    #user input core_ortholog group
-    #have to add an input option
-
-    #core-ortholog group name
-    group = "778452"
-
-    #species name assemblie (folder name in assemby folder)
-    species_name = "L.pustulata"
-
-    #assembly species_name
-    assembly_name = "contigs.fa"
-
-    augustus_ref_species = "saccharomyces_cerevisiae_S288C"
-
-    cut_off_merging_candidates = 500
-
-
-    ########################## paths ###########################################
-
-    #open core_ortholog group
-    msa_path = "../data/core_orthologs/" + group +"/"+ group + ".aln"
-    hmm_path = "../data/core_orthologs/" + group +"/hmm_dir/"+ group + ".hmm"
-    consensus_path = "tmp/" + group + ".con"
-    profile_path = "tmp/" + group + ".prfl"
-    path_assembly = "../data/assembly_dir/" + species_name + "/" + assembly_name
-
-    os.system('mkdir tmp')
-
-
-    ######################## consensus sequence ################################
-
-    #make a majority-rule consensus seqeunce with the tool hmmemit from hmmer
-    print("Building a consensus sequence \n")
-    os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path)
-    print("consensus seqeunce is finished\n")
-
-    ######################## block profile #####################################
-    print("Building a block profile \n")
-
-    os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path)
-    print("block profile is finished \n")
-    ######################## tBLASTn ###########################################
-
-    #database anlegen
-    print("creating a blast database \n")
-    os.system('makeblastdb -in ' + path_assembly + ' -dbtype nucl -parse_seqids -out ' + path_assembly)
-    print("database is finished \n")
-
-    #make a tBLASTn search against the new database
-
-    os.system('tblastn -db ' + path_assembly + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue bitscore" -out tmp/blast_results.out')
-
-    ################### search for candidate regions and extract seq ###########
-
-    # parse blast and filter for candiate regions
-    regions, number_regions = candidate_regions(cut_off_merging_candidates)
-
-    if regions == 1:
-        #no candidat region are available, no ortholog can be found
-        print("No candidate region found")
-        os.system('rm -r tmp/')
-        return 1
-
-    else:
-        print(str(number_regions) + " candiate regions were found. Extracting sequences.")
-        extract_seq(regions, path_assembly)
-
-    ############### make Augustus PPX search ####################################
-    for key in regions:
-        locations = regions[key]
-        counter = 0
-        for i in locations:
-            counter += 1
-            start = str(i[0])
-            end = str(i[1])
-            if start < end:
-            #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff")
-                os.system("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + "_" + str(counter) + ".gff")
-            else:
-                os.system("augustus --proteinprofile=" + profile_path + " --predictionStart=" + end + " --predictionEnd=" + start + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + "_" + str(counter) + ".gff")
-
-    ################# remove tmp folder ########################################
-
-    #have to be added after program ist finished, maybe use parametere so that the user can turn it off
-
-if __name__ == '__main__':
-    main()
diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
new file mode 100644
index 0000000..1606b1d
--- /dev/null
+++ b/fdog/mergeAssemblyOutput.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+#######################################################################
+# Copyright (C) 2020 Vinh Tran
+#
+#  This script is used to merge all output files (.extended.fa, .phyloprofile,
+#  _forward.domains, _reverse.domains) in a given directory into one file each.
+#
+#  This script is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License <http://www.gnu.org/licenses/> for
+#  more details
+#
+#  Contact: hannah.muelbaier@stud.uni-frankfurt.de
+#
+#######################################################################
+
+import sys
+import os
+from os import listdir as ldir
+import argparse
+from pathlib import Path
+
+def main():
+    version = '0.0.1'
+    parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.')
+    parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
+                        action='store', default='', required=True)
+    parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True)
+    parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False)
+    args = parser.parse_args()
+
+    directory = args.input
+    out = args.output
+    cleanup = args.cleanup
+    if not os.path.exists(os.path.abspath(directory)):
+        sys.exit('%s not found' % directory)
+    else:
+        directory = os.path.abspath(directory)
+
+    phyloprofile = None
+    set_phylo = set()
+    domains_0 = None
+    set_domains_f = set()
+    domains_1 = None
+    set_domains_r = set()
+    ex_fasta = None
+    set_fasta = set()
+    header_bool = False
+    for infile in ldir(directory):
+        if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile':
+            if not phyloprofile:
+                phyloprofile = open(out + '.phyloprofile', 'w')
+                phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n')
+            with open(directory + '/' + infile, 'r') as reader:
+                lines = reader.readlines()
+                for line in lines:
+                    if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo:
+                        phyloprofile.write(line)
+                if len(lines) > 1:
+                    set_phylo = set(lines)
+            if cleanup == True:
+                os.remove(directory + '/' + infile)
+        elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains':
+            if not domains_0:
+                domains_0 = open(out + '_forward.domains', 'w')
+            with open(directory + '/' + infile, 'r') as reader:
+                lines = reader.readlines()
+                for line in lines:
+                    if line not in set_domains_f:
+                        domains_0.write(line)
+                if len(lines) > 1:
+                    set_domains_f = set(lines)
+            if cleanup == True:
+                os.remove(directory + '/' + infile)
+        elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains':
+            if not domains_1:
+                domains_1 = open(out + '_reverse.domains', 'w')
+            with open(directory + '/' + infile, 'r') as reader:
+                lines = reader.readlines()
+                for line in lines:
+                    if line not in set_domains_r:
+                        domains_1.write(line)
+                if len(lines) > 1:
+                    set_domains_r = set(lines)
+            if cleanup == True:
+                os.remove(directory + '/' + infile)
+        elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa':
+            if not ex_fasta:
+                ex_fasta = open(out + '.extended.fa', 'w')
+            with open(directory + '/' + infile, 'r') as reader:
+                lines = reader.readlines()
+                header = set()
+                #print(set_fasta)
+                for line in lines:
+                    if line[0] == ">":
+                        header.add(line)
+                        if line not in set_fasta:
+                            ex_fasta.write(line)
+                            header_bool = True
+                        else:
+                            header_bool = False
+                    else:
+                        if header_bool == True:
+                            ex_fasta.write(line)
+                set_fasta = header
+            if cleanup == True:
+                os.remove(directory + '/' +infile)
+        elif infile.endswith('.tsv'):
+            os.remove(directory + '/' + infile)
+
+    if phyloprofile:
+        phyloprofile.close()
+    if domains_0:
+        domains_0.close()
+    if domains_1:
+        domains_1.close()
+    if ex_fasta:
+        ex_fasta.close()
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/fdog/runMulti.py b/fdog/runMulti.py
index 65335d5..a696495 100644
--- a/fdog/runMulti.py
+++ b/fdog/runMulti.py
@@ -28,6 +28,7 @@
 from tqdm import tqdm
 import fdog.runSingle as fdogFn
 import shutil
+import yaml
 
 def getSortedFiles(directory):
     list = os.listdir(directory)
@@ -46,8 +47,8 @@ def prepare(args, step):
     outpath, hmmpath, blastpath, searchpath, weightpath,
     coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
     fasoff, countercheck, coreFilter, minScore,
-    strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-    cpu, hyperthread, debug, silent) = args
+    strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
+    cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args
 
     mute = False
     if step == 'core':
@@ -67,9 +68,10 @@ def prepare(args, step):
     pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath]
     coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
-    orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
+    orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
     otherArgs = [cpu, hyperthread, debug, True]
-    return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute)
+    assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
+    return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute)
 
 def getSeedName(seedFile):
     seqName = seedFile.split('.')[0]
@@ -104,17 +106,20 @@ def compileCore(options, seeds, inFol, cpu, outpath):
     for seed in seeds:
         seqFile = [inFol + '/' + seed]
         seqName = getSeedName(seed)
-        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core')
-        coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute])
-    pool = mp.Pool(cpu)
-    coreOut = []
-    for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)):
-        coreOut.append(_)
-    pool.close()
-    pool.join()
+
+        if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)):
+            (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core')
+            coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute])
+    if len(coreCompilationJobs) > 0:
+        pool = mp.Pool(cpu)
+        coreOut = []
+        for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)):
+            coreOut.append(_)
+        pool.close()
+        pool.join()
+        # read logs file to get runtime for individual seeds
+        getIndividualRuntime('core', outpath, seeds)
     end = time.time()
-    # read logs file to get runtime for individual seeds
-    getIndividualRuntime('core', outpath, seeds)
     multiCoreTime = '{:5.3f}'.format(end-start)
     print('==> Core compiling finished in %s sec' % multiCoreTime) #'{:5.3f}s'.format(end-start))
     return(multiCoreTime)
@@ -126,7 +131,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath):
     for seed in seeds:
         seqFile = [inFol + '/' + seed]
         seqName = getSeedName(seed)
-        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
+        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
         if mute == True:
             print(seed)
         else:
@@ -139,7 +144,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath):
     print('==> Ortholog search finished in %s sec' % multiOrthoTime)
     return(multiOrthoTime)
 
-def joinOutputs(outpath, jobName, seeds, keep):
+def joinOutputs(outpath, jobName, seeds, keep, silent):
     print('Joining single outputs...')
     finalFa = '%s/%s.extended.fa' % (outpath, jobName)
     Path(outpath+'/singleOutput').mkdir(parents=True, exist_ok=True)
@@ -147,14 +152,20 @@ def joinOutputs(outpath, jobName, seeds, keep):
         for seed in seeds:
             seqName = getSeedName(seed)
             resultFile = '%s/%s/%s.extended.fa'  % (outpath, seqName, seqName)
+            if silent == False:
+                print(resultFile)
             if os.path.exists(resultFile):
                 with open(resultFile,'rb') as fd:
                     shutil.copyfileobj(fd, wfd)
                 shutil.move(outpath + '/' + seqName, outpath + '/singleOutput')
             else:
                 Path(outpath+'/missingOutput').mkdir(parents=True, exist_ok=True)
-                shutil.move(outpath + '/' + seqName, outpath + '/missingOutput')
+                if not os.path.exists(outpath + '/missingOutput/' + seqName):
+                    shutil.move(outpath + '/' + seqName, outpath + '/missingOutput')
+            if os.path.exists(outpath + '/' + seqName + '.fa'):
                 os.remove(outpath + '/' + seqName + '.fa')
+            if os.path.exists(os.getcwd() + '/' + seqName + '.fa'):
+                os.remove(os.getcwd() + '/' + seqName + '.fa')
     if keep == True:
         try:
             print('Compressing single outputs...')
@@ -180,7 +191,7 @@ def calcFAS (outpath, extendedFa, weightpath, cpu):
         sys.exit('Problem running\n%s' % (fasCmd))
 
 def main():
-    version = '0.0.13'
+    version = '0.0.33'
     parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
     required = parser.add_argument_group('Required arguments')
@@ -197,10 +208,12 @@ def main():
     optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='')
     optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='')
     optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='')
+    optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
 
     addtionalIO = parser.add_argument_group('Other I/O options')
     addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
     addtionalIO.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
+    addtionalIO.add_argument('--forceComplete', help='Overwrite existing core orthologs and all output files', action='store_true', default=False)
     addtionalIO.add_argument('--cleanup', help='Temporary output will be deleted. Default: True', action='store_true', default=True)
     addtionalIO.add_argument('--keep', help='Keep output of individual seed sequence. Default: False', action='store_true', default=False)
     addtionalIO.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='')
@@ -229,8 +242,15 @@ def main():
                                 action='store', default=3, type=int)
     core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05',
                                 action='store', default=0.05, type=float)
+    core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score',
+                                action='store_true', default=False)
+    core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True',
+                                action='store_true', default=True)
+    core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False',
+                                action='store_true', default=False)
 
     ortho_options = parser.add_argument_group('Search strategy options')
+    ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='')
     ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set',
                                 action='store_true', default=False)
     ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it',
@@ -239,9 +259,7 @@ def main():
                                 action='store_true', default=False)
     ortho_options.add_argument('--rep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs',
                                 action='store_true', default=False)
-    ortho_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score',
-                                action='store_true', default=False)
-    ortho_options.add_argument('--lowComplexityFilterOff', help='Switch on or off the low complexity filter for the blast search. Default: False',
+    ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. Default: False',
                                 action='store_true', default=False)
     ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00005',
                                 action='store', default=0.00005, type=float)
@@ -257,13 +275,6 @@ def main():
                                 action='store_true', default=False)
     ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
                                 action='store', default=10, type=int)
-    ortho_options.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle',
-                                choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
-    ortho_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True',
-                                action='store_true', default=True)
-    ortho_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False',
-                                action='store_true', default=False)
-    ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='')
 
     fas_options = parser.add_argument_group('FAS options')
     fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
@@ -274,11 +285,21 @@ def main():
     fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float)
 
     optional = parser.add_argument_group('Other options')
+    optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle',
+        choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
     optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int)
     optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False)
     optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
     optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
 
+    assembly_options = parser.add_argument_group('Assembly options')
+    assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
+    assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
+    assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
+    assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
+    assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
+    assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
+    assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
     ### get arguments
     args = parser.parse_args()
 
@@ -297,10 +318,12 @@ def main():
     blastpath = args.blastpath
     searchpath = args.searchpath
     weightpath = args.weightpath
+    pathFile = args.pathFile
 
     # other I/O arguments
     append = args.append
     force = args.force
+    forceComplete = args.forceComplete
     cleanup = args.cleanup
     keep = args.keep
     group = args.group
@@ -323,7 +346,7 @@ def main():
     rbh = args.rbh
     rep = args.rep
     ignoreDistance = args.ignoreDistance
-    lowComplexityFilterOff = args.lowComplexityFilterOff
+    lowComplexityFilter = args.lowComplexityFilter
     evalBlast = args.evalBlast
     evalHmmer = args.evalHmmer
     evalRelaxfac = args.evalRelaxfac
@@ -351,22 +374,89 @@ def main():
         silent = False
     else:
         silent = True
+       
+    #fdog_goes_assembly arguments
+    assembly = args.assembly
+    assemblyFile = args.assemblyFile
+    augustusRefSpec = args.augustusRefSpec
+    avIntron = args.avIntron
+    lengthExtension = args.lengthExtension
+    searchTool = args.searchTool
+    matrix = args.scoringmatrix
+
+    ### check fas
+    if not fasoff:
+        try:
+            fasVersion = subprocess.run(['calcFAS --version'], shell = True, capture_output = True, check = True)
+        except:
+            sys.exit('Problem with calcFAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!')
+
+    ### delete output folder and files if needed
+    if forceComplete:
+        if os.path.exists(outpath):
+            print("Removing existing output directory %s" % outpath)
+            shutil.rmtree(outpath)
+            Path(outpath).mkdir(parents=True, exist_ok=True)
+    if force:
+        if os.path.exists(outpath):
+            print("Removing existing files %s in %s*" % (jobName, outpath))
+            outfiles = os.listdir(outpath)
+            for item in outfiles:
+                if item.startswith(jobName):
+                    os.remove(os.path.join(outpath, item))
+                if item.startswith("runtime"):
+                    os.remove(os.path.join(outpath, item))
+            if os.path.exists(outpath + '/missing.txt'):
+                os.remove(outpath + '/missing.txt')
 
     ### get fdog and data path
+    dataPath = ''
     fdogPath = os.path.realpath(__file__).replace('/runMulti.py','')
     pathconfigFile = fdogPath + '/bin/pathconfig.txt'
     if not os.path.exists(pathconfigFile):
         sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
-    with open(pathconfigFile) as f:
-        dataPath = f.readline().strip()
+    if pathFile == '':
+        with open(pathconfigFile) as f:
+            dataPath = f.readline().strip()
+    else:
+        cfg = fdogFn.load_config(pathFile)
+        try:
+            dataPath = cfg['dataPath']
+        except:
+            dataPath = 'config'
+
     if hmmpath == '':
-        hmmpath = dataPath + '/core_orthologs'
+        hmmpath = outpath + '/core_orthologs'
+        # hmmpath = dataPath + '/core_orthologs'
+        # if dataPath == 'config':
+        #     try:
+        #         hmmpath = cfg['hmmpath']
+        #     except:
+        #         sys.exit('hmmpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
+    else:
+        hmmpath = os.path.abspath(hmmpath)
     if blastpath == '':
         blastpath = dataPath + '/blast_dir'
+        if dataPath == 'config':
+            try:
+                blastpath = cfg['blastpath']
+            except:
+                sys.exit('blastpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
     if searchpath == '':
         searchpath = dataPath + '/genome_dir'
+        if dataPath == 'config':
+            try:
+                searchpath = cfg['searchpath']
+            except:
+                sys.exit('searchpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
     if weightpath == '':
         weightpath = dataPath + '/weight_dir'
+        if dataPath == 'config':
+            try:
+                weightpath = cfg['weightpath']
+            except:
+                sys.exit('weightpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
+
 
     ### join options
     options = [fdogPath, refspec, minDist, maxDist, coreOrth,
@@ -374,10 +464,11 @@ def main():
                 outpath, hmmpath, blastpath, searchpath, weightpath,
                 coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
                 fasoff, countercheck, coreFilter, minScore,
-                strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-                cpu, hyperthread, debug, silent]
+                strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
+                cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
 
     ### START
+    Path(outpath).mkdir(parents=True, exist_ok=True)
     multiLog = open(outpath + '/' + jobName + '_log.txt', "w")
     fdogStart = time.time()
     seeds = getSortedFiles(inFol)
@@ -388,30 +479,40 @@ def main():
     if reuseCore == False:
         multiCoreTime = compileCore(options, seeds, inFol, cpu, outpath)
         multiLog.write('==> Core compilation finished in %s sec\n' % multiCoreTime)
+    else:
+        if not os.path.exists(hmmpath):
+            sys.exit('--reuseCore was set, but no core orthologs found in %s! You could use --hmmpath to manually specify the core ortholog directory.' % outpath)
 
     ### do ortholog search
     if coreOnly == False:
-        ### create list of search taxa
-        searchTaxa = ''
-        searchGroup = 'all'
-        if not group == '':
-            print('Creating list for search taxa...')
-            searchTaxa = '%s/searchTaxa.txt' % (outpath)
-            searchGroup = group
-            cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa)
-            try:
-                subprocess.call([cmd], shell = True)
-            except:
-                sys.exit('Problem running\n%s' % (cmd))
-        ### run ortholog search
-        multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath)
-        multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime)
-        ### join output
-        finalFa = joinOutputs(outpath, jobName, seeds, keep)
+        if not os.path.exists('%s/%s.extended.fa' % (outpath, jobName)):
+            ### create list of search taxa
+            searchTaxa = ''
+            searchGroup = 'all'
+            if not group == '':
+                print('Creating list for search taxa...')
+                searchTaxa = '%s/searchTaxa.txt' % (outpath)
+                searchGroup = group
+                cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa)
+                try:
+                    subprocess.call([cmd], shell = True)
+                except:
+                    sys.exit('Problem running\n%s' % (cmd))
+            ### run ortholog search
+            multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath)
+            multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime)
+            ### join output
+            finalFa = joinOutputs(outpath, jobName, seeds, keep, silent)
+        else:
+            print("%s.extended.fa found in %s! If you want to re-run the ortholog search, please use --force option." % (jobName, outpath))
         ### calculate FAS scores
         if fasoff == False:
-            fasTime = calcFAS(outpath, finalFa, weightpath, cpu)
-            multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime)
+            if not os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)):
+                if os.path.exists(finalFa) and os.path.getsize(finalFa) > 0:
+                    fasTime = calcFAS(outpath, finalFa, weightpath, cpu)
+                    multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime)
+                else:
+                    print("Final fasta file %s not exists or empty!" % finalFa)
 
     fdogEnd = time.time()
     print('==> fdogs.run finished in ' + '{:5.3f}s'.format(fdogEnd-fdogStart))
diff --git a/fdog/runSingle.py b/fdog/runSingle.py
index f235ff8..a0ded09 100644
--- a/fdog/runSingle.py
+++ b/fdog/runSingle.py
@@ -20,13 +20,24 @@
 import argparse
 import subprocess
 from pathlib import Path
+import yaml
 
 def checkFileExist(file):
     if not os.path.exists(os.path.abspath(file)):
         sys.exit('%s not found' % file)
 
+def load_config(config_file):
+    with open(config_file, 'r') as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)
+
 def checkInput(args):
     (fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath) = args
+    # create output directory
+    Path(outpath).mkdir(parents=True, exist_ok=True)
+    Path(hmmpath).mkdir(parents=True, exist_ok=True)
     # check path existing
     for path in [hmmpath, blastpath, searchpath, weightpath]:
         checkFileExist(path)
@@ -38,8 +49,6 @@ def checkInput(args):
             seqFile = fdogPath + '/data/' + seqFile
     else:
         seqFile = os.path.abspath(seqFile)
-    # create output directory
-    Path(outpath).mkdir(parents=True, exist_ok=True)
     # check refspec
     if not os.path.exists(os.path.abspath(blastpath+'/'+refspec)):
         exit('Reference taxon %s not found in %s' % (refspec, blastpath))
@@ -56,13 +65,13 @@ def getfdogInfo(fdogPath, infoType):
         exit('%s not found' % (fdogPath + '/bin/oneSeq.pl'))
 
 def runSingle(args):
-    (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args
+    (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args
     # basic command
     (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs
     cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec)
     # add paths
-    (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs
-    cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath)
+    (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs
+    cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath)
     # add other I/O options
     (append, force, noCleanup, group, blast, db) = ioArgs
     if append == True:
@@ -98,7 +107,7 @@ def runSingle(args):
     if not distDeviation == 0.05:
         cmd = cmd + ' -distDeviation=%s' % distDeviation
     # add ortholo search options
-    (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs
+    (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs
     if strict == True:
         cmd = cmd + ' -strict'
     if checkCoorthologsRef == True:
@@ -109,8 +118,8 @@ def runSingle(args):
         cmd = cmd + ' -rep'
     if ignoreDistance == True:
         cmd = cmd + ' -ignoreDistance'
-    if lowComplexityFilterOff == True:
-        cmd = cmd + ' -filter=F'
+    if lowComplexityFilter == True:
+        cmd = cmd + ' -filter=T'
     if not evalBlast == 0.00005:
         cmd = cmd + ' -evalBlast=%s' % evalBlast
     if not evalHmmer == 0.00005:
@@ -152,7 +161,28 @@ def runSingle(args):
         cmd = cmd + ' -debug'
     if silent == True:
         cmd = cmd + ' -silent'
-    # print(cmd)
+    # add assembly options
+    (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs
+    if assembly == True:
+        cmd = cmd + ' -assembly'
+        cmd = cmd + ' -reuseCore'
+        if not augustusRefSpec == '':
+            cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec
+        else:
+            sys.exit('An augutus reference species is requiered by using the option --assembly')
+        if not avIntron == '':
+            cmd = cmd + ' -avIntron=%s' % avIntron
+        if not lengthExtension == '':
+            cmd = cmd + ' -lengthExtension=%s' % lengthExtension
+        if not assemblyFile == '':
+            cmd = cmd + ' -assemblyFile=%s' % assemblyFile
+        if not searchTool == '':
+            cmd = cmd + ' -searchTool=%s' % searchTool
+        if not matrix == '':
+            cmd = cmd + ' -scoringmatrix=%s' % matrix
+        if not dataPath == '':
+            cmd = cmd + ' -dataPath=%s' % dataPath
+    #print(cmd)
     if mute == True:
         cmd = cmd + ' > /dev/null 2>&1'
     try:
@@ -161,7 +191,7 @@ def runSingle(args):
         sys.exit('Problem running\n%s' % (cmd))
 
 def main():
-    version = '0.0.13'
+    version = '0.0.33'
     parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
     required = parser.add_argument_group('Required arguments')
@@ -178,6 +208,9 @@ def main():
     optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='')
     optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='')
     optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='')
+    optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
+    optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='')
+
 
     addtionalIO = parser.add_argument_group('Other I/O options')
     addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
@@ -209,8 +242,15 @@ def main():
                                 action='store', default=3, type=int)
     core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05',
                                 action='store', default=0.05, type=float)
+    core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score',
+                                action='store_true', default=False)
+    core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True',
+                                action='store_true', default=True)
+    core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False',
+                                action='store_true', default=False)
 
     ortho_options = parser.add_argument_group('Ortholog search strategy options')
+    ortho_options.add_argument('--searchTaxa', help='Specify file contains list of search taxa', action='store', default='')
     ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set',
                                 action='store_true', default=False)
     ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it',
@@ -219,9 +259,7 @@ def main():
                                 action='store_true', default=False)
     ortho_options.add_argument('--rep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs',
                                 action='store_true', default=False)
-    ortho_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score',
-                                action='store_true', default=False)
-    ortho_options.add_argument('--lowComplexityFilterOff', help='Switch on or off the low complexity filter for the blast search. Default: False',
+    ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. Default: False',
                                 action='store_true', default=False)
     ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00005',
                                 action='store', default=0.00005, type=float)
@@ -237,13 +275,6 @@ def main():
                                 action='store_true', default=False)
     ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
                                 action='store', default=10, type=int)
-    ortho_options.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle',
-                                choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
-    ortho_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True',
-                                action='store_true', default=True)
-    ortho_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False',
-                                action='store_true', default=False)
-    ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='')
 
     fas_options = parser.add_argument_group('FAS options')
     fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
@@ -254,11 +285,21 @@ def main():
     fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float)
 
     optional = parser.add_argument_group('Other options')
+    optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle',
+        choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
     optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int)
     optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False)
     optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
     optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
 
+    assembly_options = parser.add_argument_group('Assembly options')
+    assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
+    assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
+    assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
+    assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
+    assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
+    assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
+    assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
     ### get arguments
     args = parser.parse_args()
 
@@ -277,6 +318,8 @@ def main():
     blastpath = args.blastpath
     searchpath = args.searchpath
     weightpath = args.weightpath
+    pathFile = args.pathFile
+    assemblypath = args.assemblypath
 
     # other I/O arguments
     append = args.append
@@ -302,7 +345,7 @@ def main():
     rbh = args.rbh
     rep = args.rep
     ignoreDistance = args.ignoreDistance
-    lowComplexityFilterOff = args.lowComplexityFilterOff
+    lowComplexityFilter = args.lowComplexityFilter
     evalBlast = args.evalBlast
     evalHmmer = args.evalHmmer
     evalRelaxfac = args.evalRelaxfac
@@ -331,36 +374,86 @@ def main():
     else:
         silent = True
 
+    #fdog_goes_assembly arguments
+    assembly = args.assembly
+    assemblyFile = args.assemblyFile
+    augustusRefSpec = args.augustusRefSpec
+    avIntron = args.avIntron
+    lengthExtension = args.lengthExtension
+    searchTool = args.searchTool
+    matrix = args.scoringmatrix
+
     ### get fdog and data path
+    dataPath = ''
     fdogPath = os.path.realpath(__file__).replace('/runSingle.py','')
     pathconfigFile = fdogPath + '/bin/pathconfig.txt'
     if not os.path.exists(pathconfigFile):
         sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
-    with open(pathconfigFile) as f:
-        dataPath = f.readline().strip()
+    if pathFile == '':
+        with open(pathconfigFile) as f:
+            dataPath = f.readline().strip()
+    else:
+        cfg = load_config(pathFile)
+        try:
+            dataPath = cfg['dataPath']
+        except:
+            dataPath = 'config'
+
     if hmmpath == '':
-        hmmpath = dataPath + '/core_orthologs'
+        hmmpath = outpath + '/core_orthologs'
+    #     hmmpath = dataPath + '/core_orthologs'
+    #     if dataPath == 'config':
+    #         try:
+    #             hmmpath = cfg['hmmpath']
+    #         except:
+    #             sys.exit('hmmpath not found in %s' % pathFile)
+
     if blastpath == '':
         blastpath = dataPath + '/blast_dir'
+        if dataPath == 'config':
+            try:
+                blastpath = cfg['blastpath']
+            except:
+                sys.exit('blastpath not found in %s' % pathFile)
     if searchpath == '':
         searchpath = dataPath + '/genome_dir'
+        if dataPath == 'config':
+            try:
+                searchpath = cfg['searchpath']
+            except:
+                sys.exit('searchpath not found in %s' % pathFile)
     if weightpath == '':
         weightpath = dataPath + '/weight_dir'
+        if dataPath == 'config':
+            try:
+                weightpath = cfg['weightpath']
+            except:
+                sys.exit('weightpath not found in %s' % pathFile)
+
+    if assemblypath == '':
+        assemblypath = dataPath + '/assembly_dir'
+        if dataPath == 'config':
+            try:
+                assemblypath = cfg['assemblypath']
+            except:
+                sys.exit('assemblypath not found in %s' % pathFile)
+    if assembly == True:
+        searchpath = assemblypath
 
     ### check input arguments
     seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath])
-
     # group arguments
     basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth]
     ioArgs = [append, force, noCleanup, group, blast, db]
-    pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath]
+    pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath]
     coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
-    orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
+    orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
     otherArgs = [cpu, hyperthread, debug, silent]
+    assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath]
 
     ### run fdog
-    runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False])
+    runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False])
 
 if __name__ == '__main__':
     main()
diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index 2894e6c..3c561e7 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -199,7 +199,8 @@ fi
 data_fdog_file="data_HaMStR-2019c.tar.gz"
 checkSumData="1748371655 621731824 $data_fdog_file"
 cd $outDir
-if [ ! -d "$outDir/core_orthologs" ]; then mkdir "$outDir/core_orthologs"; fi
+if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi
+if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi
 
 if ! [ "$(ls -A $outDir/genome_dir)" ]; then
   echo "-------------------------------------"
diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh
index b8c90e6..ddc4e23 100755
--- a/fdog/setup/setup_conda.sh
+++ b/fdog/setup/setup_conda.sh
@@ -116,6 +116,7 @@ dependencies=(
   mafft # for linsi
   muscle
   fasta36
+  augustus #for fdog.assembly
 )
 
 for i in "${dependencies[@]}"; do
@@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do
       fi
     elif [ "$tool" = "fasta36" ]; then
       conda install -y -c bioconda fasta3
+    elif [ "$tool" = "augustus" ]; then
+      conda install -y -c bioconda augustus
     else
       conda install -y -c bioconda $i
     fi
@@ -258,7 +261,8 @@ echo "done!"
 data_fdog_file="data_HaMStR-2019c.tar.gz"
 checkSumData="1748371655 621731824 $data_fdog_file"
 cd $outDir
-if [ ! -d "$outDir/core_orthologs" ]; then mkdir "$outDir/core_orthologs"; fi
+if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi
+if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi
 
 if ! [ "$(ls -A $outDir/genome_dir)" ]; then
   echo "-------------------------------------"
diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py
index 18c5368..b6a67d6 100644
--- a/fdog/setupfDog.py
+++ b/fdog/setupfDog.py
@@ -20,6 +20,7 @@
 import os
 import argparse
 import subprocess
+from ete3 import NCBITaxa
 from pathlib import Path
 
 def checkOptConflict(lib, conda):
@@ -28,7 +29,7 @@ def checkOptConflict(lib, conda):
             sys.exit('*** ERROR: --lib and --conda cannot be used at the same time!')
 
 def main():
-    version = '0.0.2'
+    version = '0.0.3'
     parser = argparse.ArgumentParser(description='You are running fdog.setup version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
@@ -60,6 +61,9 @@ def main():
                 dataPath = f.readline().strip()
                 print(dataPath)
         sys.exit()
+    ### get ncbi taxonomy database for ete3
+    print('Creating local NCBI taxonomy database...')
+    ncbi = NCBITaxa()
     ### run setup
     if conda:
         setupFile = '%s/setup/setup_conda.sh -o %s' % (fdogPath, outPath)
diff --git a/setup.py b/setup.py
index ad7a1b7..75573c1 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,8 @@
 
 setup(
     name="fdog",
-    version="0.0.13",
+    version="0.0.33",
+
     python_requires='>=3.7.0',
     description="Feature-aware Directed OrtholoG search tool",
     long_description=long_description,
@@ -41,7 +42,8 @@
         'tqdm',
         'ete3',
         'six',
-        'greedyFAS>=1.4.0'
+        'PyYAML',
+        'greedyFAS>=1.5.0'
     ],
     entry_points={
         'console_scripts': ["fdog.run = fdog.runSingle:main",
@@ -52,7 +54,9 @@
                             "fdog.addTaxa = fdog.addTaxa:main",
                             "fdog.showTaxa = fdog.showTaxa:main",
                             "fdog.mergeOutput = fdog.mergeOutput:main",
-                            "fdog.remove = fdog.removefDog:main"],
+                            "fdog.remove = fdog.removefDog:main",
+                            "fdog.assembly = fdog.fDOGassembly:main",
+                            "fdog.mergeAssembly = fdog.mergeAssemblyOutput:main"],
     },
     license="GPL-3.0",
     classifiers=[

From f8ccac590b46677d81bcc25516666b626a261f2b Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 30 Jun 2021 15:09:08 +0200
Subject: [PATCH 094/192] measure computational time

---
 fdog/fDOGassembly.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index de9f343..3b34a8d 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -8,6 +8,7 @@
 import argparse
 import yaml
 import subprocess
+import time
 ########################### functions ##########################################
 def load_config(config_file):
     with open(config_file, 'r') as stream:
@@ -428,10 +429,6 @@ def cleanup(tmp, tmp_path):
     if tmp == False:
         os.system('rm -r ' + tmp_path)
 
-def checkOptions():
-    pass
-    #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!!
-
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     if len(candidate_names) == 1:
         return candidate_names
@@ -520,6 +517,8 @@ def main():
 
     #################### handle user input ########################################
 
+    start = time.clock()
+
     version = '0.0.1'
 
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
@@ -796,6 +795,7 @@ def main():
     ############### make Annotation with FAS ###################################
         # if we want to search in only one Taxon
         if searchTaxon != '' and fasoff == False:
+            fas = time.clock()
             print("Calculating FAS scores")
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
@@ -816,6 +816,7 @@ def main():
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '':
+        fas = time.clock()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
@@ -833,5 +834,10 @@ def main():
 
     f.close()
 
+    end = time.clock()
+
+    print("Time w/o FAS: " + str(end-fas))
+    print("Time complete: " + str(end-start))
+
 if __name__ == '__main__':
     main()

From 1cf64f1f03dc07357e576744ba3751261b59a77b Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 30 Jun 2021 16:25:17 +0200
Subject: [PATCH 095/192] measure computational time

---
 fdog/fDOGassembly.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 3b34a8d..229a546 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -517,7 +517,7 @@ def main():
 
     #################### handle user input ########################################
 
-    start = time.clock()
+    start = time.time()
 
     version = '0.0.1'
 
@@ -795,7 +795,7 @@ def main():
     ############### make Annotation with FAS ###################################
         # if we want to search in only one Taxon
         if searchTaxon != '' and fasoff == False:
-            fas = time.clock()
+            fas = time.time()
             print("Calculating FAS scores")
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
@@ -816,7 +816,7 @@ def main():
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '':
-        fas = time.clock()
+        fas = time.time()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
@@ -834,7 +834,7 @@ def main():
 
     f.close()
 
-    end = time.clock()
+    end = time.time()
 
     print("Time w/o FAS: " + str(end-fas))
     print("Time complete: " + str(end-start))

From 6e163ba531b1816eb5faa8f4b315e6e1e5c448ff Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 30 Jun 2021 16:32:40 +0200
Subject: [PATCH 096/192] bug fix

---
 fdog/fDOGassembly.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 229a546..a6a3bb8 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -832,12 +832,14 @@ def main():
     else:
         cleanup(tmp, out + "/tmp/")
 
-    f.close()
+
 
     end = time.time()
 
     print("Time w/o FAS: " + str(end-fas))
     print("Time complete: " + str(end-start))
 
+    f.close()
+
 if __name__ == '__main__':
     main()

From 1d1c47a572015d2cef9e121705993d29090fceee Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 1 Jul 2021 09:46:57 +0200
Subject: [PATCH 097/192] testing

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index a6a3bb8..5ff5cb1 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -836,6 +836,7 @@ def main():
 
     end = time.time()
 
+    sys.stdout = sys.__stdout__
     print("Time w/o FAS: " + str(end-fas))
     print("Time complete: " + str(end-start))
 

From 6e0ce726ffd6425574bbdb73901285463b3af5a4 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 1 Jul 2021 10:12:06 +0200
Subject: [PATCH 098/192] computational time output

---
 fdog/fDOGassembly.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 5ff5cb1..c8a096b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -837,8 +837,7 @@ def main():
     end = time.time()
 
     sys.stdout = sys.__stdout__
-    print("Time w/o FAS: " + str(end-fas))
-    print("Time complete: " + str(end-start))
+    print(group + "\t" + str(end-start) + "\t" + str(end-start))
 
     f.close()
 

From a1cb75d31205dec99f6fe8ef4a6f164395086af3 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 1 Jul 2021 10:18:54 +0200
Subject: [PATCH 099/192] corrected computational time output

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c8a096b..a3ac854 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -837,7 +837,7 @@ def main():
     end = time.time()
 
     sys.stdout = sys.__stdout__
-    print(group + "\t" + str(end-start) + "\t" + str(end-start))
+    print(group + "\t" + str(end-fas) + "\t" + str(end-start))
 
     f.close()
 

From 328f26dda0e5e1eaaf22dfd37658f5af795c802d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 20 Jul 2021 15:21:28 +0200
Subject: [PATCH 100/192] automatic augustus installation during setup

---
 fdog/fDOGassembly.py      | 20 +++++++++++++++++++-
 fdog/setup/install_lib.sh |  9 ++++++++-
 fdog/setup/setup.sh       |  1 +
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index a3ac854..2575b05 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1,3 +1,21 @@
+# -*- coding: utf-8 -*-
+
+#######################################################################
+# Copyright (C) 2020 Hannah Muelbaier
+#
+#  This script is used to run fDOG-Assembly which performs targeted ortholog
+#  searches on genome assemblies
+#
+#  This script is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License <http://www.gnu.org/licenses/> for
+#  more details
+#
+#  Contact: hannah.muelbaier@gmail.com
+#
+#######################################################################
+
 ############################ imports ###########################################
 import os
 import os.path
@@ -519,7 +537,7 @@ def main():
 
     start = time.time()
 
-    version = '0.0.1'
+    version = '0.1.1'
 
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh
index ff81e88..2e8ff02 100755
--- a/fdog/setup/install_lib.sh
+++ b/fdog/setup/install_lib.sh
@@ -85,6 +85,7 @@ dependenciesUbuntu=(
   perl-doc
   locales
   lib32z1
+  augustus
 )
 
 dependenciesMac=(
@@ -94,6 +95,7 @@ dependenciesMac=(
   mafft
   brewsci/bio/muscle
   blast
+  augustus
 )
 
 if [ "$sys" == "Darwin" ]; then
@@ -108,7 +110,11 @@ else
   sudo apt-get update -y
   for i in "${dependenciesUbuntu[@]}"; do
     echo $i
-    sudo apt-get install -y -qq $i > /dev/null
+    if ["$i" == "augustus"]; then
+      sudo apt install augustus > /dev/null
+    else
+      sudo apt-get install -y -qq $i > /dev/null
+    fi
   done
 fi
 
@@ -119,6 +125,7 @@ dependencies=(
   mafft
   muscle
   blastn
+  augustus
 )
 
 for i in "${dependencies[@]}"; do
diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index 3c561e7..d9e0077 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -309,6 +309,7 @@ mafft
 muscle
 clustalw
 blastp
+augustus
 )
 
 for i in "${dependencies[@]}"; do

From 594715da279c625b4b9ff03fca153c7bcfde4695 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Jul 2021 15:43:59 +0200
Subject: [PATCH 101/192] added tblastn version check

---
 fdog/setup/setup.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index d9e0077..96ac1c1 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -310,6 +310,7 @@ muscle
 clustalw
 blastp
 augustus
+tblastn
 )
 
 for i in "${dependencies[@]}"; do
@@ -319,6 +320,13 @@ for i in "${dependencies[@]}"; do
       tool="clustalw2"
     fi
   fi
+  if [ $tool == tblastn]; then
+    requiredver="2.9.0"
+    currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
+    if [ "$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then
+      echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+    fi
+  fi
   if [ -z "$(which $tool)" ]; then
     echo -e "\t\e[31mWARNING $tool not found!\e[0m"
     flag=1

From be91b3b6d3577b91bf69c73bec3a2dec4c316d5c Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Jul 2021 15:48:43 +0200
Subject: [PATCH 102/192] bug fix

---
 fdog/setup/setup.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index 96ac1c1..e562ca8 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -320,10 +320,10 @@ for i in "${dependencies[@]}"; do
       tool="clustalw2"
     fi
   fi
-  if [ $tool == tblastn]; then
+  if [ $tool == "tblastn"]; then
     requiredver="2.9.0"
     currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
-    if [ "$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then
+    if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then
       echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
     fi
   fi

From 4b5fb49a019ab87560a383fa05e0f40e2143b501 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Jul 2021 15:55:12 +0200
Subject: [PATCH 103/192] bug fix

---
 fdog/setup/setup.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index e562ca8..d5d740b 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -320,12 +320,12 @@ for i in "${dependencies[@]}"; do
       tool="clustalw2"
     fi
   fi
-  if [ $tool == "tblastn"]; then
+  if [ $tool == "tblastn" ]; then
     requiredver="2.9.0"
     currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
-    if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then
-      echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
-    fi
+    # if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then
+    #   echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+    # fi
   fi
   if [ -z "$(which $tool)" ]; then
     echo -e "\t\e[31mWARNING $tool not found!\e[0m"

From c630d75f8ce7710924482f03bdf3e19796d471ac Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Jul 2021 16:15:41 +0200
Subject: [PATCH 104/192] testing BLAST version check

---
 fdog/setup/setup.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index d5d740b..1f74552 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -323,9 +323,10 @@ for i in "${dependencies[@]}"; do
   if [ $tool == "tblastn" ]; then
     requiredver="2.9.0"
     currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
-    # if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then
-    #   echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
-    # fi
+    t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1)
+    if [ $t == $currentver ]; then
+      echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+    fi
   fi
   if [ -z "$(which $tool)" ]; then
     echo -e "\t\e[31mWARNING $tool not found!\e[0m"

From f31cebf94a9c5161023182b307630f0f6d9e1e50 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 21 Jul 2021 16:22:54 +0200
Subject: [PATCH 105/192] tblastn version check during fdog.setup --conda

---
 fdog/setup/setup_conda.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh
index ddc4e23..7b4bd08 100755
--- a/fdog/setup/setup_conda.sh
+++ b/fdog/setup/setup_conda.sh
@@ -369,6 +369,8 @@ clustalw
 mafft
 muscle
 fasta3
+augustus
+tblastn
 )
 for i in "${condaPkgs[@]}"; do
   if [[ -z $(conda list | $grepprog "$i ") ]]; then
@@ -381,6 +383,13 @@ for i in "${condaPkgs[@]}"; do
       progname="hmmsearch"
     elif [ "$i" == "fasta3" ]; then
       progname="fasta36"
+    elif [ "$i" == "tblastn" ]; then
+      requiredver="2.9.0"
+      currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
+      t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1)
+      if [ $t == $currentver ]; then
+        echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+      fi
     fi
     if [ -z "$(which $progname)" ]; then
       echo -e "\t\e[31m$i could not be installed\e[0m"

From 6edf7a01486af4cde7da9a2a028936d5f7710d86 Mon Sep 17 00:00:00 2001
From: mueli94 <47216555+mueli94@users.noreply.github.com>
Date: Mon, 2 Aug 2021 13:36:20 +0200
Subject: [PATCH 106/192] Fdog goes assembly (#10)

* bug fix

* bug fix

* fixed error mapping ID file not found

* testing

* testing

* testing

* test

* test

* testing

* testing

* testing

* testing

* fDOGassembly is working on complete assembly_dir

* bug fix

* bug fix

* enabled option -filter for blastp search

* bug fix fasoff

* testing --strict option

* bug fix in --strict option, output is corrected

* bug fix in --checkCoorthologsRef

* bug fix

* clean up

* bug fix

* adapted handling of variable dataPath

* testing

* testing

* testing

* testing

* test

* test

* test

* test

* test

* test

* testing

* bug fix assemblyDir

* testing

* testing

* testing search taxa

* test

* enable --searchTaxa option in fdog.assembly

* bug fix

* testing

* testing --searchTaxa adaption

* testing

* test

* test

* write debug files to output dir

* skip fa.mapping while checking genome_dir

* testing

* bug fix

* testing

* bug fix

* bug fix

* path fix in augustus_ppx

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* bug fix

* testing

* testing

* added new python script to merge Assembly output from the same Gene but different searchTaxa

* added option to merge Assembly output after fDOG calls fdog.assembly multiple times with different searchTaxa

* bug fix

* corrected fdog.mergeAssembly call

* testing

* testing

* testing

* test

* moved fdog.mergeAssembly call to another place

* testing

* testing

* testing

* testing

* testing

* testing

* corrected fdog.mergeAssembly call

* testing

* testing

* testing

* testing

* test

* disable weight_dir check if option --assembly is used

* adapted fdog.assembly call

* adapted calcFAS call to deactivate .tsv output

* testing

* testing

* bug fix in function backward search used with option --strict

* testing new added option --silent

* added more checks to fdogs.run

* bug fix

* testing

* testing

* testing

* bug fix

* bug fix

* testing

* testing silent mode

* testing --silent

* symlinks for fasta36 input; improved fdogs.run according to #5

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* test

* test

* testing

* testing new function to identify coorthologs

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* finished function coorthologs

* bug fix runSingle.py

* cleaning output

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* bug fix if augustus can't identify a gene at a candidate region

* testing

* bug fix

* bug fix

* cleaning up

* testing

* testing

* testing

* testing

* bug fix in merge function, regions in minus strand were not merged correctly

* testing

* testing

* testing

* testing

* testing

* bug fix

* testing

* testing

* testing

* testing

* testing

* clean up

* testing

* testing

* testing

* testing

* bug fix

* testing new tblastn call

* testing

* testing

* testing

* testing

* testing

* code clean up

* clean up code

* clean up

* clean up

* reduce output

* clean up code

* check augustus

* testing

* adding option to recognize if co-ortholog or not in header of the extended.fa

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* added function starting_subprocess() to handle call of extern tools more easily

* added augustus to dependencies

* testing

* bug fix

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* testing

* added function to clean up .domain files

* testing

* testing

* testing

* testing

* improve user output

* fdog.assembly started with fDOG is always silent

* testing

* testing output

* testing

* testing

* testing

* testing

* testing

* removing automatically .tsv files if existing

* measure computational time

* measure computational time

* bug fix

* testing

* computational time output

* corrected computational time output

* automatic augustus installation during setup

* added tblastn version check

* bug fix

* bug fix

* testing BLAST version check

* tblastn version check during fdog.setup --conda

Co-authored-by: trvinh <trvinh@gmail.com>
---
 fdog/fDOGassembly.py      | 34 +++++++++++++++++++++++++++++-----
 fdog/setup/install_lib.sh |  9 ++++++++-
 fdog/setup/setup.sh       | 10 ++++++++++
 fdog/setup/setup_conda.sh |  9 +++++++++
 4 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index de9f343..46f83c0 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1,3 +1,21 @@
+# -*- coding: utf-8 -*-
+
+#######################################################################
+# Copyright (C) 2020 Hannah Muelbaier
+#
+#  This script is used to run fDOG-Assembly which performs targeted ortholog
+#  searches on genome assemblies
+#
+#  This script is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License <http://www.gnu.org/licenses/> for
+#  more details
+#
+#  Contact: hannah.muelbaier@gmail.com
+#
+#######################################################################
+
 ############################ imports ###########################################
 import os
 import os.path
@@ -8,6 +26,8 @@
 import argparse
 import yaml
 import subprocess
+import time
+=======
 ########################### functions ##########################################
 def load_config(config_file):
     with open(config_file, 'r') as stream:
@@ -428,10 +448,6 @@ def cleanup(tmp, tmp_path):
     if tmp == False:
         os.system('rm -r ' + tmp_path)
 
-def checkOptions():
-    pass
-    #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!!
-
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     if len(candidate_names) == 1:
         return candidate_names
@@ -520,7 +536,10 @@ def main():
 
     #################### handle user input ########################################
 
-    version = '0.0.1'
+    start = time.time()
+
+    version = '0.1.1'
+
 
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
@@ -796,6 +815,7 @@ def main():
     ############### make Annotation with FAS ###################################
         # if we want to search in only one Taxon
         if searchTaxon != '' and fasoff == False:
+            fas = time.time()
             print("Calculating FAS scores")
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
@@ -816,6 +836,7 @@ def main():
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '':
+        fas = time.time()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
@@ -831,6 +852,9 @@ def main():
     else:
         cleanup(tmp, out + "/tmp/")
 
+    end = time.time()
+    sys.stdout = sys.__stdout__
+    #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
     f.close()
 
 if __name__ == '__main__':
diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh
index ff81e88..2e8ff02 100755
--- a/fdog/setup/install_lib.sh
+++ b/fdog/setup/install_lib.sh
@@ -85,6 +85,7 @@ dependenciesUbuntu=(
   perl-doc
   locales
   lib32z1
+  augustus
 )
 
 dependenciesMac=(
@@ -94,6 +95,7 @@ dependenciesMac=(
   mafft
   brewsci/bio/muscle
   blast
+  augustus
 )
 
 if [ "$sys" == "Darwin" ]; then
@@ -108,7 +110,11 @@ else
   sudo apt-get update -y
   for i in "${dependenciesUbuntu[@]}"; do
     echo $i
-    sudo apt-get install -y -qq $i > /dev/null
+    if ["$i" == "augustus"]; then
+      sudo apt install augustus > /dev/null
+    else
+      sudo apt-get install -y -qq $i > /dev/null
+    fi
   done
 fi
 
@@ -119,6 +125,7 @@ dependencies=(
   mafft
   muscle
   blastn
+  augustus
 )
 
 for i in "${dependencies[@]}"; do
diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index 3c561e7..1f74552 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -309,6 +309,8 @@ mafft
 muscle
 clustalw
 blastp
+augustus
+tblastn
 )
 
 for i in "${dependencies[@]}"; do
@@ -318,6 +320,14 @@ for i in "${dependencies[@]}"; do
       tool="clustalw2"
     fi
   fi
+  if [ $tool == "tblastn" ]; then
+    requiredver="2.9.0"
+    currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
+    t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1)
+    if [ $t == $currentver ]; then
+      echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+    fi
+  fi
   if [ -z "$(which $tool)" ]; then
     echo -e "\t\e[31mWARNING $tool not found!\e[0m"
     flag=1
diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh
index ddc4e23..7b4bd08 100755
--- a/fdog/setup/setup_conda.sh
+++ b/fdog/setup/setup_conda.sh
@@ -369,6 +369,8 @@ clustalw
 mafft
 muscle
 fasta3
+augustus
+tblastn
 )
 for i in "${condaPkgs[@]}"; do
   if [[ -z $(conda list | $grepprog "$i ") ]]; then
@@ -381,6 +383,13 @@ for i in "${condaPkgs[@]}"; do
       progname="hmmsearch"
     elif [ "$i" == "fasta3" ]; then
       progname="fasta36"
+    elif [ "$i" == "tblastn" ]; then
+      requiredver="2.9.0"
+      currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
+      t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1)
+      if [ $t == $currentver ]; then
+        echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+      fi
     fi
     if [ -z "$(which $progname)" ]; then
       echo -e "\t\e[31m$i could not be installed\e[0m"

From 1b4232e6cd214650007e5c24055f3c8618fe01ae Mon Sep 17 00:00:00 2001
From: mueli94 <47216555+mueli94@users.noreply.github.com>
Date: Mon, 2 Aug 2021 13:41:02 +0200
Subject: [PATCH 107/192] Added link to fDOG-Assembly poster for QfO 6.5

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 52f11e2..9343943 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@
 [![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG)
 ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg)
 
+# Poster fDOG - Assembly
+(https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf)
 # Table of Contents
 * [How to install](#how-to-install)
      * [Install the fDOG package](#install-the-fdog-package)

From 4798b8fea54782a68d60935bb157ad28cfeaaadb Mon Sep 17 00:00:00 2001
From: mueli94 <47216555+mueli94@users.noreply.github.com>
Date: Mon, 2 Aug 2021 13:41:22 +0200
Subject: [PATCH 108/192] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9343943..8db83ce 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg)
 
 # Poster fDOG - Assembly
-(https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf)
+https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf
 # Table of Contents
 * [How to install](#how-to-install)
      * [Install the fDOG package](#install-the-fdog-package)

From d64177c3cb0a6afd8a89687a4ee8196f2f85fc7d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 10:22:47 +0200
Subject: [PATCH 109/192] added option checkOff

---
 fdog/fDOGassembly.py | 8 +++++---
 fdog/runMulti.py     | 8 ++++----
 fdog/runSingle.py    | 2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 46f83c0..424b6e3 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-# Copyright (C) 2020 Hannah Muelbaier
+# Copyright (C) 2021 Hannah Muelbaier
 #
 #  This script is used to run fDOG-Assembly which performs targeted ortholog
 #  searches on genome assemblies
@@ -538,7 +538,7 @@ def main():
 
     start = time.time()
 
-    version = '0.1.1'
+    version = '0.1.2'
 
 
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
@@ -668,7 +668,6 @@ def main():
     else:
         sys.stdout = Logger(f)
 
-    # user input has to be checked here before fDOGassembly continues
     assembly_names = os.listdir(assemblyDir)
 
     ########################## some variables ##################################
@@ -683,6 +682,9 @@ def main():
     consensus_path = out + "/tmp/" + group + ".con"
     profile_path = out + "/tmp/" + group + ".prfl"
 
+    ##################### need a check to see if reference species is part of the core group !##########
+
+
     ###################### create tmp folder ###################################
 
     cmd = 'mkdir ' + out + '/tmp'
diff --git a/fdog/runMulti.py b/fdog/runMulti.py
index 6862f6d..c19b598 100644
--- a/fdog/runMulti.py
+++ b/fdog/runMulti.py
@@ -48,7 +48,7 @@ def prepare(args, step):
     coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
     fasoff, countercheck, coreFilter, minScore,
     strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-    cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args
+    cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args
 
 
     mute = False
@@ -70,7 +70,7 @@ def prepare(args, step):
     coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
     orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
-    otherArgs = [cpu, hyperthread, debug, True]
+    otherArgs = [cpu, hyperthread, checkOff, debug, True]
     assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
     return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute)
 
@@ -378,7 +378,7 @@ def main():
         silent = False
     else:
         silent = True
-       
+
     #fdog_goes_assembly arguments
     assembly = args.assembly
     assemblyFile = args.assemblyFile
@@ -472,7 +472,7 @@ def main():
                 coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
                 fasoff, countercheck, coreFilter, minScore,
                 strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-                cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
+                cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
 
     ### START
     Path(outpath).mkdir(parents=True, exist_ok=True)
diff --git a/fdog/runSingle.py b/fdog/runSingle.py
index 1b8a943..c65300f 100644
--- a/fdog/runSingle.py
+++ b/fdog/runSingle.py
@@ -453,7 +453,7 @@ def main():
     coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
     orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
-    otherArgs = [cpu, hyperthread, debug, silent]
+    otherArgs = [cpu, hyperthread, checkOff, debug, silent]
     assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath]
 
     ### run fdog

From ef6b0dc6903837130cbac00ea9d6f499e1330373 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 11:17:54 +0200
Subject: [PATCH 110/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 424b6e3..1800a0a 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -27,7 +27,7 @@
 import yaml
 import subprocess
 import time
-=======
+
 ########################### functions ##########################################
 def load_config(config_file):
     with open(config_file, 'r') as stream:

From d4bf11fb965dcd512790f3ec164c237deaa3a9d4 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 12:06:06 +0200
Subject: [PATCH 111/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 1800a0a..bf272d7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -837,7 +837,7 @@ def main():
         cleanup(tmp, tmp_path)
         return 1
     #if we searched in more than one taxon
-    if fasoff == False and searchTaxon == '':
+    if fasoff == False and searchTaxon == '' and len(assembly_names) > 1:
         fas = time.time()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'

From 62badce99d56fb4e634335e20db8cafebcfd89a3 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 12:10:35 +0200
Subject: [PATCH 112/192] testing

---
 fdog/fDOGassembly.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index bf272d7..2b5eaf7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -838,6 +838,8 @@ def main():
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '' and len(assembly_names) > 1:
+        print(len(assembly_names))
+        print(assembly_names)
         fas = time.time()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'

From a51b8f4a0c0b60a33d47f4908efa8630bd67dfca Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 12:11:58 +0200
Subject: [PATCH 113/192] testing

---
 fdog/fDOGassembly.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index bf272d7..2b5eaf7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -838,6 +838,8 @@ def main():
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '' and len(assembly_names) > 1:
+        print(len(assembly_names))
+        print(assembly_names)
         fas = time.time()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'

From 147bbc9df5d5bf36382ddd222b9c081d061a3797 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 12:21:26 +0200
Subject: [PATCH 114/192] fixed

---
 fdog/fDOGassembly.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 2b5eaf7..bf272d7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -838,8 +838,6 @@ def main():
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '' and len(assembly_names) > 1:
-        print(len(assembly_names))
-        print(assembly_names)
         fas = time.time()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'

From a992e322ca4fd459de9d0d99d867622548dc1af7 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 14:49:14 +0200
Subject: [PATCH 115/192] fixed FAS call

---
 fdog/fDOGassembly.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index bf272d7..9c12e9a 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -831,8 +831,7 @@ def main():
 
 
     #if we searched in more than one Taxon and no ortholog was found
-
-    if refBool == False and searchTaxon == '':
+    if refBool == False and searchTaxon == '' and len(assembly_names) > 1:
         print("No orthologs found. Exciting ...")
         cleanup(tmp, tmp_path)
         return 1
@@ -843,7 +842,7 @@ def main():
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-        cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
+        cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         starting_subprocess(cmd, 'silent')
         clean_fas(out + group + "_forward.domains", 'domains')
         clean_fas(out + group + "_reverse.domains", 'domains')

From abea0980ae0ba82e1565d45cabffdb455e85cdce Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 10 Sep 2021 15:29:06 +0200
Subject: [PATCH 116/192] changed FAS call

---
 fdog/fDOGassembly.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 9c12e9a..950aef3 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -823,7 +823,7 @@ def main():
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
             cmd = 'mkdir ' + tmp_path + 'anno_dir'
             starting_subprocess(cmd, 'silent')
-            cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
+            cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
             starting_subprocess(cmd, 'silent')
             clean_fas(fasOutFile + "_forward.domains", 'domains')
             clean_fas(fasOutFile + "_reverse.domains", 'domains')
@@ -831,12 +831,12 @@ def main():
 
 
     #if we searched in more than one Taxon and no ortholog was found
-    if refBool == False and searchTaxon == '' and len(assembly_names) > 1:
+    if refBool == False and searchTaxon == '':
         print("No orthologs found. Exciting ...")
         cleanup(tmp, tmp_path)
         return 1
     #if we searched in more than one taxon
-    if fasoff == False and searchTaxon == '' and len(assembly_names) > 1:
+    if fasoff == False and searchTaxon == '':
         fas = time.time()
         print("Calculating FAS scores")
         tmp_path = out + '/tmp/'

From d56b83e9cd76ce678b756de5856572d86b31a563 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 14 Sep 2021 16:29:15 +0200
Subject: [PATCH 117/192] new function that checks if input path exists and new
 function that checks if reference species is part of core_group, multiple
 reference species are accepted, improved output

---
 fdog/fDOGassembly.py | 78 ++++++++++++++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 24 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 950aef3..b27fcbe 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -29,6 +29,26 @@
 import time
 
 ########################### functions ##########################################
+def check_path(path):
+    if not os.path.exists(path):
+        print(path + " does not exist. Exciting ...")
+        sys.exit()
+
+def check_ref_sepc(species_list, fasta_file):
+    file = open(fasta_file, "r")
+    lines = file.readlines()
+    species_file = []
+
+    for line in lines:
+        if line[0] == ">":
+            species = line.split("|")[1]
+            species_file.append(species)
+    for species in species_list:
+        if species in species_file:
+            return species
+    print("Reference species is not part of the ortholog group. Exciting ...")
+    sys.exit()
+
 def load_config(config_file):
     with open(config_file, 'r') as stream:
         try:
@@ -298,40 +318,40 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
             id, gene, evalue = (line.replace("\n", "")).split("\t")
             gene_name = gene.split("|")[2]
             if gene_name != old_name:
-                print("candidate:%s"%(gene_name))
-                print("blast-hit:%s"%(id))
+                print("candidate:%s"%(gene_name)) if mode == "debug" else ""
+                print("blast-hit:%s"%(id)) if mode == "debug" else ""
                 min = float(evalue)
                 if id in id_ref:
                     orthologs.append(gene)
-                    print("\thitting\n")
+                    print("\thitting\n") if mode == "debug" else ""
                 else:
                     if checkCo == True:
                         for i in id_ref:
-                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else ""
                             co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
                             if co_orthologs_result == 1:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                                 orthologs.append(gene)
                             elif co_orthologs_result == 0:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                     else:
-                        print("\tnothitting\n")
+                        print("\tnothitting\n") if mode == "debug" else ""
             elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs:
                 if id in id_ref:
                     orthologs.append(gene)
-                    print("\thitting\n")
+                    print("\thitting\n") if mode == "debug" else ""
                 else:
                     if checkCo == True:
                         for i in id_ref:
-                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else ""
                             co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
                             if co_orthologs_result == 1:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                                 orthologs.append(gene)
                             elif co_orthologs_result == 0:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                     else:
-                        print("\tnot hitting\n")
+                        print("\tnot hitting\n") if mode == "debug" else ""
             old_name = gene_name
 
 
@@ -548,7 +568,7 @@ def main():
     required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/',
                             action='store', default='', required=True)
     required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True)
-    required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True)
+    required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True)
 
     optional = parser.add_argument_group('Optional arguments')
     optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int)
@@ -611,6 +631,7 @@ def main():
     silent = args.silent
     debug = args.debug
 
+    # output modes
     if debug == True and silent == True:
         print("It's not possible to use booth modes, please restart and use --debug or --silent")
         return 1
@@ -637,22 +658,27 @@ def main():
                 dataPath = cfg['dataPath']
             except:
                 dataPath = 'config'
-    if core_path == '':
-        core_path = out + '/core_orthologs/'
-    else:
-        if not core_path.endswith('/'):
-            core_path = core_path + '/'
 
-    if assemblyDir == '':
-        assemblyDir = dataPath + '/assembly_dir/'
+
     if out == '':
-        #print('test out \n')
         out = os.getcwd()
         os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
         out = out + '/' + group + '/'
     else:
         if out[-1] != "/":
             out = out + "/"
+        check_path(out)
+
+    if core_path == '':
+        core_path = out + '/core_orthologs/'
+    else:
+        if not core_path.endswith('/'):
+            core_path = core_path + '/'
+        check_path(core_path)
+
+    if assemblyDir == '':
+        assemblyDir = dataPath + '/assembly_dir/'
+    check_path(assemblyDir)
 
 
     try:
@@ -674,16 +700,20 @@ def main():
 
     refBool = False # checks if sequences of reference species were already part of the extended.fa file
 
-    ########### paths ###########
+    ################################# paths ####################################
 
     msa_path = core_path + "/" + group +"/"+ group + ".aln"
+    check_path(msa_path)
     hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
+    check_path(hmm_path)
     fasta_path = core_path + "/" + group +"/"+ group + ".fa"
+    check_path(fasta_path)
     consensus_path = out + "/tmp/" + group + ".con"
     profile_path = out + "/tmp/" + group + ".prfl"
 
-    ##################### need a check to see if reference species is part of the core group !##########
+    ############## is fDOG reference species part of ortholog group? ###########
 
+    fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path)
 
     ###################### create tmp folder ###################################
 
@@ -842,7 +872,7 @@ def main():
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-        cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
+        cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         starting_subprocess(cmd, 'silent')
         clean_fas(out + group + "_forward.domains", 'domains')
         clean_fas(out + group + "_reverse.domains", 'domains')

From 343199263b697131ca6fcac375aa59b3e10b7458 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 15 Sep 2021 15:11:25 +0200
Subject: [PATCH 118/192] improved user output

---
 fdog/fDOGassembly.py | 59 ++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b27fcbe..232090d 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -554,22 +554,21 @@ def flush(self):
 
 def main():
 
-    #################### handle user input ########################################
+    #################### handle user input #####################################
 
     start = time.time()
 
     version = '0.1.2'
-
-
+    ################### initialize parser ######################################
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
-
+    ################## required arguments ######################################
     required = parser.add_argument_group('Required arguments')
     required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/',
                             action='store', default='', required=True)
     required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True)
     required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True)
-
+    ################## optional arguments ######################################
     optional = parser.add_argument_group('Optional arguments')
     optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int)
     optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int)
@@ -592,7 +591,6 @@ def main():
     optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
     optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
 
-
     args = parser.parse_args()
 
     # required
@@ -711,7 +709,7 @@ def main():
     consensus_path = out + "/tmp/" + group + ".con"
     profile_path = out + "/tmp/" + group + ".prfl"
 
-    ############## is fDOG reference species part of ortholog group? ###########
+    ########### is/are fDOG reference species part of ortholog group? ##########
 
     fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path)
 
@@ -720,32 +718,33 @@ def main():
     cmd = 'mkdir ' + out + '/tmp'
     starting_subprocess(cmd, 'silent')
 
+    print("Gene: " + group)
+    print("fDOG reference species: " + fdog_ref_species + " \n")
+
     ######################## consensus sequence ################################
 
     #make a majority-rule consensus sequence with the tool hmmemit from hmmer
-    print("Building a consensus sequence for gene " + group + " \n")
+    print("Building a consensus sequence")
     cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
     starting_subprocess(cmd, mode)
-    print("consensus sequence is finished\n")
+    print("\t ...finished\n")
 
     ######################## block profile #####################################
 
-    print("Building a block profile for gene " + group + " \n")
+    print("Building a block profile ...")
     cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
     starting_subprocess(cmd, 'silent')
 
     if int(os.path.getsize(profile_path)) > 0:
-        print("block profile is finished \n")
+        print("\t ...finished \n")
     else:
         print("Building block profiles failed. Using prepareAlign to convert alignment\n")
         new_path = core_path + group +"/"+ group + "_new.aln"
-        #print(cmd)
         cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
         starting_subprocess(cmd, mode)
         cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
-        #print(cmd)
         starting_subprocess(cmd, 'silent')
-        print("block profile is finished \n")
+        print(" \t ...finished \n")
 
     searchBool = False
 
@@ -780,19 +779,17 @@ def main():
         #checks if data base exists already
         db_check = searching_for_db(db_path)
         if db_check == 0:
-            print("creating a blast data base \n")
+            print("Creating a blast data base...")
             cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
             starting_subprocess(cmd, mode)
-            print("database is finished \n")
-        else:
-            print('blast data base exists already, continuing...')
+            print("\t ...finished \n")
 
-        #makes a tBLASTn search against the new database
+        #makes a tBLASTn search against database
         #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
-        print("tBLASTn search against data base")
+        print("Starting tBLASTn search...")
         cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
         starting_subprocess(cmd, mode)
-        print("tBLASTn search is finished")
+        print("\t ...finished")
 
     ################### search for candidate regions and extract seq ###########
     # parse blast and filter for candiate regions
@@ -800,25 +797,25 @@ def main():
 
         if regions == 0:
             #no candidat region are available, no ortholog can be found
-            print("No candidate region found")
+            print("No candidate region found!\n")
             if refBool == True:
                 continue
             else:
                 taxa = [fdog_ref_species]
                 reciprocal_sequences = 0
         else:
-            print(str(number_regions) + " candiate regions were found. Extracting sequences...")
+            print(str(number_regions) + " candiate regions were found.\n")
             extract_seq(regions, db_path, tmp_path, mode)
 
     ############### make Augustus PPX search ###################################
 
-            print("starting augustus ppx \n")
+            print("Starting augustus ppx ...")
             augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-            print("augustus is finished \n")
+            print("\t ...finished \n")
 
     ################# backward search to filter for orthologs###################
             if int(os.path.getsize(candidatesOutFile)) <= 0:
-                print("No genes found at candidate regions\n")
+                print("No genes found at candidate region\n")
                 if searchTaxon == '' and refBool == True:
                     continue
                 else:
@@ -831,7 +828,7 @@ def main():
     ################## checking accepted genes for co-orthologs ################
         if reciprocal_sequences == 0:
             if regions != 0:
-                print("No ortholog fulfilled the reciprocity criteria")
+                print("No ortholog fulfilled the reciprocity criteria \n")
             if searchTaxon == '' and refBool == True:
                 continue
             else:
@@ -848,7 +845,7 @@ def main():
         # if we want to search in only one Taxon
         if searchTaxon != '' and fasoff == False:
             fas = time.time()
-            print("Calculating FAS scores")
+            print("Calculating FAS scores ...")
             fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
             # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
             cmd = 'mkdir ' + tmp_path + 'anno_dir'
@@ -858,6 +855,7 @@ def main():
             clean_fas(fasOutFile + "_forward.domains", 'domains')
             clean_fas(fasOutFile + "_reverse.domains", 'domains')
             clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile')
+            print("\t ...finished \n")
 
 
     #if we searched in more than one Taxon and no ortholog was found
@@ -868,7 +866,7 @@ def main():
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '':
         fas = time.time()
-        print("Calculating FAS scores")
+        print("Calculating FAS scores ...")
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
@@ -877,6 +875,7 @@ def main():
         clean_fas(out + group + "_forward.domains", 'domains')
         clean_fas(out + group + "_reverse.domains", 'domains')
         clean_fas(out + group + ".phyloprofile", 'phyloprofile')
+        print("\t ...finished \n")
     ################# remove tmp folder ########################################
     if searchTaxon != '':
         cleanup(tmp, tmp_path)
@@ -886,7 +885,9 @@ def main():
     end = time.time()
     sys.stdout = sys.__stdout__
     #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
+    print("fDOG-Assembly finished complete in " + str(end-start) + "seconds.")
     f.close()
 
+
 if __name__ == '__main__':
     main()

From a843bfeec60a534776ec3f1e7c036c880a7b2e74 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 17 Sep 2021 10:52:37 +0200
Subject: [PATCH 119/192] added timeout for tblastn search, fixed bug during
 deletion of tmp folder

---
 fdog/fDOGassembly.py | 56 ++++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 232090d..c54590c 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -27,6 +27,7 @@
 import yaml
 import subprocess
 import time
+import shutil
 
 ########################### functions ##########################################
 def check_path(path):
@@ -56,13 +57,17 @@ def load_config(config_file):
         except yaml.YAMLError as exc:
             print(exc)
 
-def starting_subprocess(cmd, mode):
-    if mode == 'debug':
-        result = subprocess.run(cmd, shell=True)
-    elif mode == 'silent':
-        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
-    elif mode == 'normal':
-        result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True)
+def starting_subprocess(cmd, mode, time_out = None):
+
+    try:
+        if mode == 'debug':
+            result = subprocess.run(cmd, shell=True, timeout = time_out)
+        elif mode == 'silent':
+            result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out)
+        elif mode == 'normal':
+            result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out)
+    except subprocess.TimeoutExpired:
+        return 1
 
 def merge(blast_results, insert_length):
     #merging overlapping and contigous candidate regions
@@ -162,10 +167,11 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
 
     if blast_results == {}:
+        blast_file.close()
         return 0,0
     else:
         candidate_regions, number_regions = merge(blast_results, intron_length)
-
+        blast_file.close()
         return candidate_regions, number_regions
 
 def extract_seq(region_dic, path, tmp_path, mode):
@@ -270,6 +276,10 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
     if msaTool == "muscle":
         os.system("muscle -quiet -in " + output_file + " -out " + aln_file)
         #print("muscle -quiet -in " + output_file + " -out " + aln_file)
+        if not os.path.exists(aln_file):
+            print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.")
+            os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
+
     elif msaTool == "mafft-linsi":
         #print("mafft-linsi")
         os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
@@ -461,12 +471,13 @@ def createFasInput(orthologsOutFile, mappingFile):
         ncbi_id = (seq.id.split("@"))[1]
         mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n")
 
-
+    mappingFile.close()
     return fas_seed_id
 
 def cleanup(tmp, tmp_path):
     if tmp == False:
-        os.system('rm -r ' + tmp_path)
+        while os.path.exists(tmp_path):
+            shutil.rmtree(tmp_path, ignore_errors=True)
 
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     if len(candidate_names) == 1:
@@ -537,6 +548,7 @@ def clean_fas(path, file_type):
             new_line = id + "\t" + remain
 
         file.write(new_line)
+    file.close()
 
 class Logger(object):
     def __init__(self, file):
@@ -708,6 +720,7 @@ def main():
     check_path(fasta_path)
     consensus_path = out + "/tmp/" + group + ".con"
     profile_path = out + "/tmp/" + group + ".prfl"
+    tmp_folder = out + "/tmp"
 
     ########### is/are fDOG reference species part of ortholog group? ##########
 
@@ -760,7 +773,7 @@ def main():
 
         cmd = 'mkdir ' + out + '/tmp/' + asName
         starting_subprocess(cmd, 'silent')
-        tmp_path = out + "/tmp/" + asName + "/"
+        tmp_path = out + "tmp/" + asName + "/"
         candidatesOutFile = tmp_path + group + ".candidates.fa"
         if searchTaxon != '':
             orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa"
@@ -788,8 +801,14 @@ def main():
         #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
         print("Starting tBLASTn search...")
         cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
-        starting_subprocess(cmd, mode)
-        print("\t ...finished")
+        exit_code = starting_subprocess(cmd, mode, 3600)
+        if exit_code == 1:
+            print("The tblastn search takes too long. Exciting ...")
+            f.close()
+            cleanup(tmp, tmp_folder)
+            sys.exit()
+        else:
+            print("\t ...finished")
 
     ################### search for candidate regions and extract seq ###########
     # parse blast and filter for candiate regions
@@ -861,7 +880,8 @@ def main():
     #if we searched in more than one Taxon and no ortholog was found
     if refBool == False and searchTaxon == '':
         print("No orthologs found. Exciting ...")
-        cleanup(tmp, tmp_path)
+        f.close()
+        cleanup(tmp, tmp_folder)
         return 1
     #if we searched in more than one taxon
     if fasoff == False and searchTaxon == '':
@@ -878,14 +898,16 @@ def main():
         print("\t ...finished \n")
     ################# remove tmp folder ########################################
     if searchTaxon != '':
-        cleanup(tmp, tmp_path)
+        f.close()
+        cleanup(tmp, tmp_folder)
     else:
-        cleanup(tmp, out + "/tmp/")
+        f.close()
+        cleanup(tmp, tmp_folder)
 
     end = time.time()
     sys.stdout = sys.__stdout__
     #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
-    print("fDOG-Assembly finished complete in " + str(end-start) + "seconds.")
+    print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.")
     f.close()
 
 

From 36fc207095c5f865547ad6a5b152632ebb71f575 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 20 Sep 2021 16:45:31 +0200
Subject: [PATCH 120/192] added options force and append

---
 fdog/fDOGassembly.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c54590c..09795e4 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -28,6 +28,7 @@
 import subprocess
 import time
 import shutil
+import multiprocessing as mp
 
 ########################### functions ##########################################
 def check_path(path):
@@ -602,6 +603,8 @@ def main():
     optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='')
     optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
     optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
+    optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
+    optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
 
     args = parser.parse_args()
 
@@ -640,6 +643,8 @@ def main():
     searchTaxon = args.searchTaxon
     silent = args.silent
     debug = args.debug
+    force = args.force
+    append = args.append
 
     # output modes
     if debug == True and silent == True:
@@ -672,13 +677,25 @@ def main():
 
     if out == '':
         out = os.getcwd()
-        os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
-        out = out + '/' + group + '/'
     else:
         if out[-1] != "/":
             out = out + "/"
         check_path(out)
 
+    if os.path.exists(out + '/' + group):
+        if append != True and force != True:
+            print("Output folder for group " + group + " exists already. Please choose --force or --append.")
+            sys.exit()
+        elif force == True:
+            shutil.rmtree(out + '/' + group, ignore_errors=True)
+        elif append == True:
+            refBool = True # checks if sequences of reference species were already part of the extended.fa file
+        else:
+            refBool = False # checks if sequences of reference species were already part of the extended.fa file
+    else:
+        os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
+        out = out + '/' + group + '/'
+
     if core_path == '':
         core_path = out + '/core_orthologs/'
     else:
@@ -704,11 +721,9 @@ def main():
     else:
         sys.stdout = Logger(f)
 
-    assembly_names = os.listdir(assemblyDir)
+    ########################### other variables ################################
 
-    ########################## some variables ##################################
-
-    refBool = False # checks if sequences of reference species were already part of the extended.fa file
+    assembly_names = os.listdir(assemblyDir)
 
     ################################# paths ####################################
 

From 2e17db197f2e3e70f0c372a56314fc4722647770 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 23 Sep 2021 13:59:58 +0200
Subject: [PATCH 121/192] tested --force and --append, only the 10 best
 candidate regions (regarding score) will be evaluated

---
 fdog/fDOGassembly.py | 65 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 13 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 09795e4..ae29b29 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -72,13 +72,13 @@ def starting_subprocess(cmd, mode, time_out = None):
 
 def merge(blast_results, insert_length):
     #merging overlapping and contigous candidate regions
+    #format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>, <score>)]}
     number_regions = 0
     insert_length = int(insert_length)
+    score_list = []
     for key in blast_results:
         locations = blast_results[key]
         locations = sorted(locations, key = lambda x: int(x[3]))
-        #print("test")
-        #print(locations)
         size_list = len(locations)
         j = 0
         while j < size_list-1:
@@ -88,6 +88,8 @@ def merge(blast_results, insert_length):
                     #merge overlapping regions plus strand
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
@@ -95,6 +97,8 @@ def merge(blast_results, insert_length):
                     #merge overlapping regions minus strand
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
@@ -102,6 +106,8 @@ def merge(blast_results, insert_length):
                     #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -=1
@@ -109,20 +115,24 @@ def merge(blast_results, insert_length):
                     #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -=1
                 i += 1
             j += 1
 
+        for entry in locations:
+            score_list.append(entry[6])
         number_regions += len(locations)
         blast_results[key] = locations
 
-    return blast_results, number_regions
+    return blast_results, number_regions, score_list
 
 def parse_blast(line, blast_results, cutoff):
-    # format blast line:  <contig> <sstart> <send> <evalue> <qstart> <qend>
-    # format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
+    # format blast line:  <contig> <sstart> <send> <evalue> <qstart> <qend> <score>
+    # format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>, <score>)]}
     line = line.replace("\n", "")
     line_info = line.split("\t")
     evalue = float(line_info[3])
@@ -131,7 +141,7 @@ def parse_blast(line, blast_results, cutoff):
         return blast_results, evalue
     #add region to dictionary
     else:
-        node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5])
+        node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6])
         split = node_name.split("|")
         # finding out on which strand tBLASTn found a hit
         if sstart < send:
@@ -145,14 +155,32 @@ def parse_blast(line, blast_results, cutoff):
             node_name = split[1]
         if node_name in blast_results:
             list = blast_results[node_name]
-            list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand])
+            list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score])
             blast_results[node_name] = list
         else:
-            blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]]
+            blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]]
 
     return blast_results, evalue
 
-def candidate_regions(intron_length, cutoff_evalue, tmp_path):
+def get_x_results(blast_dic, x, score_list):
+
+    new_dic = {}
+    score_list.sort(reverse=True)
+    min = score_list[x - 1]
+    number_regions = 0
+
+    for key in blast_dic:
+        key_list = []
+        entries = blast_dic[key]
+        for i in entries:
+            if i[6] >= min:
+                key_list.append(i)
+        if key_list != []:
+            new_dic[key] = key_list
+            number_regions += len(key_list)
+    return new_dic, number_regions
+
+def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10):
     ###################### extracting candidate regions ########################
     # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
     blast_file = open(tmp_path + "/blast_results.out", "r")
@@ -171,8 +199,10 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         blast_file.close()
         return 0,0
     else:
-        candidate_regions, number_regions = merge(blast_results, intron_length)
+        candidate_regions, number_regions, score_list = merge(blast_results, intron_length)
         blast_file.close()
+        if number_regions > x:
+            candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list)
         return candidate_regions, number_regions
 
 def extract_seq(region_dic, path, tmp_path, mode):
@@ -551,6 +581,10 @@ def clean_fas(path, file_type):
         file.write(new_line)
     file.close()
 
+def ortholog_search():
+    
+    pass
+
 class Logger(object):
     def __init__(self, file):
         self.file = file
@@ -583,7 +617,7 @@ def main():
     required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True)
     ################## optional arguments ######################################
     optional = parser.add_argument_group('Optional arguments')
-    optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int)
+    optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int)
     optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int)
     optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='')
     optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False)
@@ -688,13 +722,18 @@ def main():
             sys.exit()
         elif force == True:
             shutil.rmtree(out + '/' + group, ignore_errors=True)
+            refBool = False
+            os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
+            out = out + '/' + group + '/'
         elif append == True:
-            refBool = True # checks if sequences of reference species were already part of the extended.fa file
+            out = out + '/' + group + '/'
+            refBool = True
         else:
             refBool = False # checks if sequences of reference species were already part of the extended.fa file
     else:
         os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
         out = out + '/' + group + '/'
+        refBool = False
 
     if core_path == '':
         core_path = out + '/core_orthologs/'
@@ -815,7 +854,7 @@ def main():
         #makes a tBLASTn search against database
         #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
         print("Starting tBLASTn search...")
-        cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+        cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
         exit_code = starting_subprocess(cmd, mode, 3600)
         if exit_code == 1:
             print("The tblastn search takes too long. Exciting ...")

From 80562870c5d6395f3aa9cb256281dea1c157104a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 28 Sep 2021 16:07:06 +0200
Subject: [PATCH 122/192] create a function that performs the ortholog search
 and returns the headers of the found sequences and the corresponding tmp file
 in which the sequence is located

---
 fdog/fDOGassembly.py | 315 +++++++++++++++++++++++++++++--------------
 1 file changed, 214 insertions(+), 101 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index ae29b29..37b7095 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -464,6 +464,38 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
     orthologs = set(orthologs)
     return list(orthologs), seed
 
+def addRef(output, core_fasta, species_list):
+    print(species_list)
+    output_file = open(output, "a+")
+    seq_records_core = readFasta(core_fasta)
+    seq_records_core = list(seq_records_core)
+    for species in species_list:
+        for entry_core in seq_records_core:
+            if species in entry_core.id:
+                output_file.write(">" + entry_core.id + "\n")
+                output_file.write(str(entry_core.seq) + "\n")
+    output_file.close()
+
+def addSeq(output, seq_list):
+    output_file = open(output, "a+")
+
+    for item in seq_list:
+        candidate_fasta = item[0]
+        sequenceIds = item[1]
+        if sequenceIds == 0 or sequenceIds == []:
+            pass
+        seq_records_candidate = readFasta(candidate_fasta)
+        seq_records_candidate = list(seq_records_candidate)
+        for entry_candidate in seq_records_candidate:
+            if entry_candidate.id in sequenceIds:
+                if entry_candidate.id == sequenceIds[0]:
+                    output_file.write(">" + entry_candidate.id + "|1" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
+                else:
+                    output_file.write(">" + entry_candidate.id + "|0" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
+    output_file.close()
+
 def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
 
     output_file = open(output, "a+")
@@ -581,9 +613,69 @@ def clean_fas(path, file_type):
         file.write(new_line)
     file.close()
 
-def ortholog_search():
-    
-    pass
+def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs):
+    cmd = 'mkdir ' + out + '/tmp/' + asName
+    starting_subprocess(cmd, 'silent')
+    tmp_path = out + "tmp/" + asName + "/"
+    candidatesOutFile = tmp_path + group + ".candidates.fa"
+    #orthologsOutFile = out + "/" + group + ".extended.fa"
+    fasOutFile = out + "/" + group
+    #mappingFile = out + "/tmp/" + group + ".mapping.txt"
+
+    print("Searching in species " + asName + "\n")
+    assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
+    db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
+    db_check = searching_for_db(db_path)
+
+    if db_check == 0:
+        print("Creating a blast data base...")
+        cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
+        starting_subprocess(cmd, mode)
+        print("\t ...finished \n")
+
+    #makes a tBLASTn search against database
+    #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+    print("Starting tBLASTn search...")
+    cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+    exit_code = starting_subprocess(cmd, mode, 3600)
+    if exit_code == 1:
+        print("The tblastn search takes too long. Exciting ...")
+        f.close()
+        cleanup(tmp, tmp_folder)
+        sys.exit()
+    else:
+        print("\t ...finished")
+
+    regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
+    if regions == 0:
+        #no candidat region are available, no ortholog can be found
+        print("No candidate region found for species %s!\n" % asName)
+        return [], candidatesOutFile
+
+    else:
+        print(str(number_regions) + " candiate regions were found for species %s.\n" % asName)
+        extract_seq(regions, db_path, tmp_path, mode)
+
+    ############### make Augustus PPX search ###################################
+    print("Starting augustus ppx ...")
+    augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
+    print("\t ...finished \n")
+
+    ################# backward search to filter for orthologs###################
+    if int(os.path.getsize(candidatesOutFile)) <= 0:
+        print("No genes found at candidate regions\n")
+        return [], candidatesOutFile
+
+    reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
+
+    if reciprocal_sequences == 0:
+        if regions != 0:
+            print("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
+        return [], candidatesOutFile
+    else:
+        reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+
+    return reciprocal_sequences, candidatesOutFile
 
 class Logger(object):
     def __init__(self, file):
@@ -639,7 +731,7 @@ def main():
     optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
     optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
     optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
-
+    optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False)
     args = parser.parse_args()
 
     # required
@@ -679,6 +771,7 @@ def main():
     debug = args.debug
     force = args.force
     append = args.append
+    parallel = args.parallel
 
     # output modes
     if debug == True and silent == True:
@@ -815,120 +908,140 @@ def main():
 
     searchBool = False
 
-    #################### fDOG assembly computation for all species #############
-    for asName in assembly_names:
-        if searchBool == True:
-            break
-        if searchTaxon != '' and searchBool == False:
-            asName = searchTaxon
-            searchBool = True
+    if searchTaxon == '':
+        ortholog_sequences = []
+        cpus = mp.cpu_count()
+        print(cpus)
+        #pool = mp.Pool(cpus)
+        for asName in assembly_names:
+            reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs)
+            ortholog_sequences.append([candidatesOutFile, reciprocal_sequences])
+
+        orthologsOutFile = out + "/" + group + ".extended.fa"
+
+        if taxa == []:
+            taxa = [fdog_ref_species]
+        addRef(orthologsOutFile, fasta_path, taxa)
+        addSeq(orthologsOutFile, ortholog_sequences)
+        refBool = True
+        mappingFile = out + "/tmp/" + group + ".mapping.txt"
 
-        ################### path definitions ###################################
 
-        cmd = 'mkdir ' + out + '/tmp/' + asName
-        starting_subprocess(cmd, 'silent')
-        tmp_path = out + "tmp/" + asName + "/"
-        candidatesOutFile = tmp_path + group + ".candidates.fa"
-        if searchTaxon != '':
-            orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa"
-            fasOutFile = out + "/" + group + "_" + asName
-            mappingFile = tmp_path + group + "_" + asName + ".mapping.txt"
-        else:
-            orthologsOutFile = out + "/" + group + ".extended.fa"
-            fasOutFile = out + "/" + group
-            mappingFile = out + "/tmp/" + group + ".mapping.txt"
-
-        print("Searching in species " + asName + "\n")
-        assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
-        db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
-
-    ######################## tBLASTn ###########################################
-        #checks if data base exists already
-        db_check = searching_for_db(db_path)
-        if db_check == 0:
-            print("Creating a blast data base...")
-            cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
-            starting_subprocess(cmd, mode)
-            print("\t ...finished \n")
-
-        #makes a tBLASTn search against database
-        #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
-        print("Starting tBLASTn search...")
-        cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
-        exit_code = starting_subprocess(cmd, mode, 3600)
-        if exit_code == 1:
-            print("The tblastn search takes too long. Exciting ...")
-            f.close()
-            cleanup(tmp, tmp_folder)
-            sys.exit()
-        else:
-            print("\t ...finished")
+    else:
+    #################### fDOG assembly computation for all species #############
+        for asName in assembly_names:
+            if searchBool == True:
+                break
+            if searchTaxon != '' and searchBool == False:
+                asName = searchTaxon
+                searchBool = True
 
-    ################### search for candidate regions and extract seq ###########
-    # parse blast and filter for candiate regions
-        regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
+            ################### path definitions ###################################
 
-        if regions == 0:
-            #no candidat region are available, no ortholog can be found
-            print("No candidate region found!\n")
-            if refBool == True:
-                continue
+            cmd = 'mkdir ' + out + '/tmp/' + asName
+            starting_subprocess(cmd, 'silent')
+            tmp_path = out + "tmp/" + asName + "/"
+            candidatesOutFile = tmp_path + group + ".candidates.fa"
+            if searchTaxon != '':
+                orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa"
+                fasOutFile = out + "/" + group + "_" + asName
+                mappingFile = tmp_path + group + "_" + asName + ".mapping.txt"
             else:
-                taxa = [fdog_ref_species]
-                reciprocal_sequences = 0
-        else:
-            print(str(number_regions) + " candiate regions were found.\n")
-            extract_seq(regions, db_path, tmp_path, mode)
-
-    ############### make Augustus PPX search ###################################
+                orthologsOutFile = out + "/" + group + ".extended.fa"
+                fasOutFile = out + "/" + group
+                mappingFile = out + "/tmp/" + group + ".mapping.txt"
+
+            print("Searching in species " + asName + "\n")
+            assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
+            db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
+
+        ######################## tBLASTn ###########################################
+            #checks if data base exists already
+            db_check = searching_for_db(db_path)
+            if db_check == 0:
+                print("Creating a blast data base...")
+                cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
+                starting_subprocess(cmd, mode)
+                print("\t ...finished \n")
+
+            #makes a tBLASTn search against database
+            #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+            print("Starting tBLASTn search...")
+            cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+            exit_code = starting_subprocess(cmd, mode, 3600)
+            if exit_code == 1:
+                print("The tblastn search takes too long. Exciting ...")
+                f.close()
+                cleanup(tmp, tmp_folder)
+                sys.exit()
+            else:
+                print("\t ...finished")
 
-            print("Starting augustus ppx ...")
-            augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-            print("\t ...finished \n")
+        ################### search for candidate regions and extract seq ###########
+        # parse blast and filter for candiate regions
+            regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
 
-    ################# backward search to filter for orthologs###################
-            if int(os.path.getsize(candidatesOutFile)) <= 0:
-                print("No genes found at candidate region\n")
-                if searchTaxon == '' and refBool == True:
+            if regions == 0:
+                #no candidat region are available, no ortholog can be found
+                print("No candidate region found!\n")
+                if refBool == True:
                     continue
                 else:
-                    reciprocal_sequences = 0
                     taxa = [fdog_ref_species]
+                    reciprocal_sequences = 0
             else:
-                reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
+                print(str(number_regions) + " candiate regions were found.\n")
+                extract_seq(regions, db_path, tmp_path, mode)
+
+        ############### make Augustus PPX search ###################################
+
+                print("Starting augustus ppx ...")
+                augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
+                print("\t ...finished \n")
+
+        ################# backward search to filter for orthologs###################
+                if int(os.path.getsize(candidatesOutFile)) <= 0:
+                    print("No genes found at candidate region\n")
+                    if searchTaxon == '' and refBool == True:
+                        continue
+                    else:
+                        reciprocal_sequences = 0
+                        taxa = [fdog_ref_species]
+                else:
+                    reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 
 
-    ################## checking accepted genes for co-orthologs ################
-        if reciprocal_sequences == 0:
-            if regions != 0:
-                print("No ortholog fulfilled the reciprocity criteria \n")
-            if searchTaxon == '' and refBool == True:
-                continue
+        ################## checking accepted genes for co-orthologs ################
+            if reciprocal_sequences == 0:
+                if regions != 0:
+                    print("No ortholog fulfilled the reciprocity criteria \n")
+                if searchTaxon == '' and refBool == True:
+                    continue
+                else:
+                    reciprocal_sequences = 0
             else:
-                reciprocal_sequences = 0
-        else:
-            reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+                reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
-    ################ add sequences to extended.fa in the output folder##########
+        ################ add sequences to extended.fa in the output folder##########
 
-        addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
-        refBool = True
+            addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
+            refBool = True
 
-    ############### make Annotation with FAS ###################################
-        # if we want to search in only one Taxon
-        if searchTaxon != '' and fasoff == False:
-            fas = time.time()
-            print("Calculating FAS scores ...")
-            fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
-            # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-            cmd = 'mkdir ' + tmp_path + 'anno_dir'
-            starting_subprocess(cmd, 'silent')
-            cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
-            starting_subprocess(cmd, 'silent')
-            clean_fas(fasOutFile + "_forward.domains", 'domains')
-            clean_fas(fasOutFile + "_reverse.domains", 'domains')
-            clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile')
-            print("\t ...finished \n")
+        ############### make Annotation with FAS ###################################
+            # if we want to search in only one Taxon
+            if searchTaxon != '' and fasoff == False:
+                fas = time.time()
+                print("Calculating FAS scores ...")
+                fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
+                # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
+                cmd = 'mkdir ' + tmp_path + 'anno_dir'
+                starting_subprocess(cmd, 'silent')
+                cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
+                starting_subprocess(cmd, 'silent')
+                clean_fas(fasOutFile + "_forward.domains", 'domains')
+                clean_fas(fasOutFile + "_reverse.domains", 'domains')
+                clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile')
+                print("\t ...finished \n")
 
 
     #if we searched in more than one Taxon and no ortholog was found

From ee3636413a9a826d523229d21c9d4e5b88113fe3 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 29 Sep 2021 16:10:44 +0200
Subject: [PATCH 123/192] added parallelization with the multiprocessing library

---
 fdog/fDOGassembly.py | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 37b7095..aadb3f0 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -480,8 +480,9 @@ def addSeq(output, seq_list):
     output_file = open(output, "a+")
 
     for item in seq_list:
-        candidate_fasta = item[0]
-        sequenceIds = item[1]
+        print(item)
+        candidate_fasta = item[1]
+        sequenceIds = item[0]
         if sequenceIds == 0 or sequenceIds == []:
             pass
         seq_records_candidate = readFasta(candidate_fasta)
@@ -613,7 +614,8 @@ def clean_fas(path, file_type):
         file.write(new_line)
     file.close()
 
-def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs):
+def ortholog_search(args):
+    (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args
     cmd = 'mkdir ' + out + '/tmp/' + asName
     starting_subprocess(cmd, 'silent')
     tmp_path = out + "tmp/" + asName + "/"
@@ -628,23 +630,23 @@ def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_speci
     db_check = searching_for_db(db_path)
 
     if db_check == 0:
-        print("Creating a blast data base...")
+        #print("Creating a blast data base...")
         cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
         starting_subprocess(cmd, mode)
-        print("\t ...finished \n")
+        #print("\t ...finished \n")
 
     #makes a tBLASTn search against database
     #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
-    print("Starting tBLASTn search...")
+    #print("Starting tBLASTn search...")
     cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
     exit_code = starting_subprocess(cmd, mode, 3600)
     if exit_code == 1:
-        print("The tblastn search takes too long. Exciting ...")
+        print("The tblastn search takes too long for species %s. Exciting ..." % asName)
         f.close()
         cleanup(tmp, tmp_folder)
         sys.exit()
-    else:
-        print("\t ...finished")
+    #else:
+        #print("\t ...finished")
 
     regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
     if regions == 0:
@@ -657,13 +659,13 @@ def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_speci
         extract_seq(regions, db_path, tmp_path, mode)
 
     ############### make Augustus PPX search ###################################
-    print("Starting augustus ppx ...")
+    #print("Starting augustus ppx ...")
     augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-    print("\t ...finished \n")
+    #print("\t ...finished \n")
 
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:
-        print("No genes found at candidate regions\n")
+        #print("No genes found at candidate regions\n")
         return [], candidatesOutFile
 
     reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
@@ -910,15 +912,20 @@ def main():
 
     if searchTaxon == '':
         ortholog_sequences = []
+        calls = []
         cpus = mp.cpu_count()
-        print(cpus)
-        #pool = mp.Pool(cpus)
+        pool = mp.Pool(cpus)
         for asName in assembly_names:
-            reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs)
-            ortholog_sequences.append([candidatesOutFile, reciprocal_sequences])
-
+            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs])
+        #for asName in assembly_names:
+            #reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs)
+            #ortholog_sequences.append([candidatesOutFile, reciprocal_sequences])
+        results = (pool.imap_unordered(ortholog_search, calls))
+        pool.close()
+        pool.join()
         orthologsOutFile = out + "/" + group + ".extended.fa"
-
+        for i in results:
+            ortholog_sequences.append(i)
         if taxa == []:
             taxa = [fdog_ref_species]
         addRef(orthologsOutFile, fasta_path, taxa)

From da8cdcc67d7ae8306c73deb4421e0d2b9078689a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 1 Oct 2021 10:57:19 +0200
Subject: [PATCH 124/192] added output for computational time

---
 fdog/fDOGassembly.py | 81 +++++++++++++++++++++++++++++---------------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index aadb3f0..97ec269 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -341,7 +341,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
         try:
             id_ref = seedDic[fdog_ref_species]
         except KeyError:
-            print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
+            #print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
             return 0, seed
         if searchTool == "blast":
             cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
@@ -397,7 +397,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
 
 
         if orthologs == []:
-            print("No hit in the backward search, ...exciting")
+            #print("No hit in the backward search, ...exciting")
             return 0, seed
 
     else:
@@ -422,12 +422,12 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
         orthologs = set({})
 
         for species in seed:
-            print("backward search in species " + species + "\n")
+            print("backward search in species %s\n" %species)
             orthologs_new = set({})
             try:
                 id_ref = seedDic[species]
             except KeyError:
-                print("The species " + species + " isn't part of the core ortholog group, ... exciting")
+                #print("The species " + species + " isn't part of the core ortholog group, ... exciting")
                 return 0, seed
 
             cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
@@ -450,12 +450,13 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
 
             #print(species)
             #print(orthologs_new)
+            #print(orthologs)
             if species == fdog_ref_species:
                 orthologs = orthologs_new
             else:
                 orthologs = orthologs & orthologs_new
-                if orthologs == {}:
-                    print("No ortholog was found with option --strict")
+                if len(orthologs) == 0:
+                    #print("No ortholog was found with option --strict")
                     return 0, seed
 
 
@@ -465,7 +466,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
     return list(orthologs), seed
 
 def addRef(output, core_fasta, species_list):
-    print(species_list)
+    #print(species_list)
     output_file = open(output, "a+")
     seq_records_core = readFasta(core_fasta)
     seq_records_core = list(seq_records_core)
@@ -480,7 +481,7 @@ def addSeq(output, seq_list):
     output_file = open(output, "a+")
 
     for item in seq_list:
-        print(item)
+        #print(item)
         candidate_fasta = item[1]
         sequenceIds = item[0]
         if sequenceIds == 0 or sequenceIds == []:
@@ -540,8 +541,12 @@ def createFasInput(orthologsOutFile, mappingFile):
 
 def cleanup(tmp, tmp_path):
     if tmp == False:
+        timeout = time.time() + 60*1
         while os.path.exists(tmp_path):
             shutil.rmtree(tmp_path, ignore_errors=True)
+            if time.time() > timeout:
+                print("tmp folder could not be removed!")
+                break
 
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     if len(candidate_names) == 1:
@@ -639,7 +644,10 @@ def ortholog_search(args):
     #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
     #print("Starting tBLASTn search...")
     cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+    time_tblastn_start = time.time()
     exit_code = starting_subprocess(cmd, mode, 3600)
+    time_tblastn_end = time.time()
+    time_tblastn = time_tblastn_end - time_tblastn_start
     if exit_code == 1:
         print("The tblastn search takes too long for species %s. Exciting ..." % asName)
         f.close()
@@ -647,6 +655,7 @@ def ortholog_search(args):
         sys.exit()
     #else:
         #print("\t ...finished")
+    print("Time tblastn %s in species %s" % (str(time_tblastn), asName))
 
     regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
     if regions == 0:
@@ -655,13 +664,17 @@ def ortholog_search(args):
         return [], candidatesOutFile
 
     else:
-        print(str(number_regions) + " candiate regions were found for species %s.\n" % asName)
+        print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName)
         extract_seq(regions, db_path, tmp_path, mode)
 
     ############### make Augustus PPX search ###################################
     #print("Starting augustus ppx ...")
+    time_augustus_start = time.time()
     augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
     #print("\t ...finished \n")
+    time_augustus_end = time.time()
+    time_augustus = time_augustus_end - time_augustus_start
+    print("Time augustus: %s species %s \n" % (str(time_augustus), asName))
 
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:
@@ -884,7 +897,7 @@ def main():
     print("fDOG reference species: " + fdog_ref_species + " \n")
 
     ######################## consensus sequence ################################
-
+    group_computation_time_start = time.time()
     #make a majority-rule consensus sequence with the tool hmmemit from hmmer
     print("Building a consensus sequence")
     cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
@@ -908,24 +921,35 @@ def main():
         starting_subprocess(cmd, 'silent')
         print(" \t ...finished \n")
 
+    group_computation_time_end = time.time()
+    time_group = group_computation_time_end - group_computation_time_start
+
     searchBool = False
 
     if searchTaxon == '':
         ortholog_sequences = []
-        calls = []
-        cpus = mp.cpu_count()
-        pool = mp.Pool(cpus)
-        for asName in assembly_names:
-            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs])
-        #for asName in assembly_names:
-            #reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs)
-            #ortholog_sequences.append([candidatesOutFile, reciprocal_sequences])
-        results = (pool.imap_unordered(ortholog_search, calls))
-        pool.close()
-        pool.join()
+        time_ortholog_start = time.time()
+        if parallel == True:
+            calls = []
+            cpus = mp.cpu_count()
+            pool = mp.Pool(cpus)
+            for asName in assembly_names:
+                calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs])
+
+            results = (pool.imap_unordered(ortholog_search, calls))
+            pool.close()
+            pool.join()
+            for i in results:
+                ortholog_sequences.append(i)
+        else:
+            for asName in assembly_names:
+                args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]
+                reciprocal_sequences, candidatesOutFile = ortholog_search(args)
+                ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
+
         orthologsOutFile = out + "/" + group + ".extended.fa"
-        for i in results:
-            ortholog_sequences.append(i)
+        time_ortholog_end = time.time()
+        time_ortholog = time_ortholog_end - time_ortholog_start
         if taxa == []:
             taxa = [fdog_ref_species]
         addRef(orthologsOutFile, fasta_path, taxa)
@@ -1071,6 +1095,11 @@ def main():
         clean_fas(out + group + ".phyloprofile", 'phyloprofile')
         print("\t ...finished \n")
     ################# remove tmp folder ########################################
+    end = time.time()
+    time_fas = end - fas
+    print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.")
+    print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
+    sys.stdout = sys.__stdout__
     if searchTaxon != '':
         f.close()
         cleanup(tmp, tmp_folder)
@@ -1078,11 +1107,9 @@ def main():
         f.close()
         cleanup(tmp, tmp_folder)
 
-    end = time.time()
-    sys.stdout = sys.__stdout__
+
     #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
-    print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.")
-    f.close()
+
 
 
 if __name__ == '__main__':

From ba752aa04f5ccf706982b3647499396c0064137d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 11 Oct 2021 13:29:27 +0200
Subject: [PATCH 125/192] updated fDOG-Assembly structure. fDOG-Assembly is now
 a separate script and can only be started with the command fdog.assembly

---
 fdog/bin/oneSeq.pl          | 125 +++-----------------
 fdog/fDOGassembly.py        | 223 ++++++++----------------------------
 fdog/mergeAssemblyOutput.py | 124 --------------------
 fdog/runMulti.py            |  31 +----
 fdog/runSingle.py           |  64 +----------
 5 files changed, 74 insertions(+), 493 deletions(-)
 delete mode 100644 fdog/mergeAssemblyOutput.py

diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl
index 1b0839f..a99e1e6 100755
--- a/fdog/bin/oneSeq.pl
+++ b/fdog/bin/oneSeq.pl
@@ -207,7 +207,6 @@
 my $idx_dir = "$path/taxonomy/";
 my $dataDir = $path . '/data';
 my $weightPath = "$path/weight_dir/";
-my $assembly_dir = "$path/assembly_dir/";
 
 my @defaultRanks = (
 	'superkingdom', 'kingdom',
@@ -312,15 +311,6 @@
 my %hashTree;
 my $aln = 'muscle';
 my $searchTaxa;
-#variables for fdog_goes_assembly
-my $assembly;
-my $augustusRefSpec;
-my $avIntron;
-my $lengthExtension;
-my $assemblyPath;
-my $searchTool = 'blast';
-my $matrix = 'blosum62';
-my $dataPath = '';
 ################# Command line options
 GetOptions (
 	"h"                 => \$help,
@@ -383,15 +373,7 @@
 	"distDeviation=s"	=> \$distDeviation,
 	"aligner=s"	=> \$aln,
 	"hyperthread" => \$hyperthread,
-	"searchTaxa=s" => \$searchTaxa,
-	"assembly" => \$assembly,
-	"assemblypath=s" => \$assemblyPath,
-	"augustusRefSpec=s" => \$augustusRefSpec,
-	"avIntron=s" => \$avIntron,
-	"lengthExtension=s" => \$lengthExtension,
-	"searchTool=s" => \$searchTool,
-	"scoringmatrix=s" => \$matrix,
-	"dataPath=s" => \$dataPath
+	"searchTaxa=s" => \$searchTaxa
 );
 
 $outputPath = abs_path($outputPath);
@@ -403,8 +385,6 @@
 $weightPath = abs_path($weightPath)."/";
 $genome_dir = abs_path($genome_dir)."/";
 $taxaPath = $genome_dir;
-$dataPath = abs_path($dataPath)."/";
-$assembly_dir = abs_path($assemblyPath)."/";
 
 ############# do initial check
 if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) {
@@ -414,7 +394,7 @@
 		initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff);
 	}
 
-	if (!defined $coreex && !defined $assembly) {
+	if (!defined $coreex) {
 		if (!grep(/$minDist/, @defaultRanks)) {
 			die "ERROR: minDist $minDist invalid!\n";
 		}
@@ -498,7 +478,7 @@
 
 # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction)
 # get annotations for seed sequence if fas support is on
-if ($fas_support && !$assembly){
+if ($fas_support){
 	if (!$weightPath) {
 		createWeightFolder();
 	}
@@ -507,7 +487,7 @@
 
 my $coreStTime = gettime(); #time;
 #core-ortholog search
-if (!$coreex && !$assembly) {
+if (!$coreex) {
 	print "\nCore compiling...\n";
 	$coremode = 1;
 	$taxaPath = $blastPath;
@@ -645,12 +625,7 @@
 	my $final_eval_blast = $eval_blast*$eval_relaxfac;
 	my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac;
 
-	if (!$assembly){
-		$taxaPath = $genome_dir;
-	}
-	else{
-		$taxaPath = $assembly_dir;
-	}
+	$taxaPath = $genome_dir;
 	my @searchTaxa;
 	unless ($searchTaxa) {
 		unless($groupNode) {
@@ -706,63 +681,7 @@
 				}
 			}
 		}
-		if ($assembly){
-			$eval_blast = sprintf("%f", $eval_blast);
-			if ($seqFile ne "") {
-				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent");
-
-				if (defined $assemblyPath){
-					push(@assembly_cmd, "--assemblyPath $assemblyPath")
-				}
-				if (defined $avIntron){
-					push(@assembly_cmd, "--avIntron $avIntron ");
-				}
-				if (defined $lengthExtension){
-					push(@assembly_cmd, "--lengthExtension $lengthExtension ");
-				}
-				if (!$autoclean){
-					push(@assembly_cmd, "--tmp ");
-				}
-				if ($outputPath){
-					push(@assembly_cmd, "--out $outputPath ");
-				}
-				if (defined $strict){
-					push(@assembly_cmd, "--strict");
-				}
-				if ($eval_blast){
-					push(@assembly_cmd, "--evalBlast $eval_blast ");
-				}
-				if ($searchTool){
-					push(@assembly_cmd, "--msaTool $aln ");
-				}
-				if (defined $checkcoorthologsref){
-					push(@assembly_cmd, "--checkCoorthologsRef");
-				}
-				if ($searchTool){
-					push(@assembly_cmd, "--searchTool $searchTool");
-				}
-				if ($matrix){
-					push(@assembly_cmd, "--scoringmatrix $matrix");
-				}
-				if ($coreOrthologsPath){
-					push(@assembly_cmd, "--coregroupPath $coreOrthologsPath");
-				}
-				if ($fasoff){
-					push(@assembly_cmd, "--fasoff");
-				}
-				if ($searchTaxon){
-					push(@assembly_cmd, "--searchTaxon $searchTaxon");
-				}
-				if ($filter){
-					push(@assembly_cmd, "--filter $filter");
-				}
-				printDebug(@assembly_cmd);
-				system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n";
-			}
-		}
-		else{
 		runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln);
-		}
 		$pm->finish;
 	}
 	$pm->wait_all_children;
@@ -774,8 +693,8 @@
 push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!";
 print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n";
 
-
-if(!$coreOnly && !$assembly){
+## Evaluation of all orthologs that are predicted by the final run
+if(!$coreOnly){
 	my $fasStTime = gettime();
 	my $processID = $$;
 
@@ -787,7 +706,7 @@
 	addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
 
 	# calculate FAS scores for final extended.fa
-	if ($fas_support && !$assembly) {
+	if ($fas_support) {
 		print "Starting the feature architecture similarity score computation...\n";
 		my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno";
 		unless ($countercheck) {
@@ -800,21 +719,12 @@
 	}
 	push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
 	print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
-
 	if($autoclean){
 		print "Cleaning up...\n";
 		runAutoCleanUp($processID);
 	}
 }
 
-if ($assembly){
-	my $file_assembly_out;
-	$file_assembly_out = $outputPath . '/' . $seqName;
-	my $cmd_merge;
-	$cmd_merge = "fdog.mergeAssembly --in  $outputPath --out  $file_assembly_out --cleanup";
-	printDebug($cmd_merge);
-	system($cmd_merge);
-}
 ## Delete tmp folder
 unless ($debug) {
 	my $delTmp = "rm -rf $tmpdir";
@@ -1224,10 +1134,10 @@ sub checkOptions {
 	if ($force == 1 and $append ==1) {
 		$force = 0;
 	}
-	### check the presence of the pre-computed core set if options reuseCore or assembly is used
-	if ($coreex || $assembly) {
+	### check the presence of the pre-computed core set
+	if ($coreex) {
 		if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") {
-			print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
+			print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
 			exit;
 		}
 	}
@@ -1298,7 +1208,7 @@ sub checkOptions {
 
 	### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected
 	$optbreaker = 0;
-	while(!$minCoreOrthologs and (!$coreex and !$assembly)) {
+	while(!$minCoreOrthologs and !$coreex) {
 		if ($optbreaker >= 3){
 			print "No proper number given ... exiting.\n";
 			exit;
@@ -1313,12 +1223,10 @@ sub checkOptions {
 		$filter = 'no' if $filter eq 'F';
 	}
 
-	if (!$assembly){
-		$inputSeq = fetchSequence($seqFile, $dataDir);
-	}
+	$inputSeq = fetchSequence($seqFile, $dataDir);
 
 	## the user has not provided a sequence id, however, the refspec is determined.
-	if($seqId eq '' && !$assembly) {
+	if($seqId eq '') {
 		my $besthit;
 		if (!$blast){
 			## a refspec has been determined
@@ -1445,9 +1353,8 @@ sub checkOptions {
 	#### checking for the min and max distance for the core set compilation
 	#### omit this check, if the option reuseCore has been selected (added 2019-02-04)
 	$optbreaker = 0;
-	if (!$coreex and !$assembly) {
+	if (!$coreex) {
 		my $node;
-		#print "Testing coreex assembly\n";
 		$node = $db->get_taxon(-taxonid => $refTaxa{$refSpec});
 		$node->name('supplied', $refSpec);
 		if (lc($maxDist) eq "root"){
@@ -2709,7 +2616,7 @@ sub initialCheck {
 		}
 	}
 	# check weight_dir
-	if ($fasoff != 1 && !$assembly) {
+	if ($fasoff != 1) {
 		my %seen;
 		my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir);
 		my @notFolder;
diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 97ec269..eb9dc41 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -737,11 +737,11 @@ def main():
     optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
     optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False)
     optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
-    optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='')
+    optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[])
     optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
     optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
     optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
-    optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='')
+    optional.add_argument('--searchTaxa', help='Search Taxon name', action='store', nargs="+", default=[])
     optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
     optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
     optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
@@ -776,12 +776,8 @@ def main():
     msaTool = args.msaTool
     matrix = args.scoringmatrix
     taxa = args.coreTaxa
-    if taxa == '':
-        taxa =[]
-    else:
-        taxa = taxa.split(",")
     fasoff = args.fasoff
-    searchTaxon = args.searchTaxon
+    searchTaxa = args.searchTaxa
     silent = args.silent
     debug = args.debug
     force = args.force
@@ -816,7 +812,6 @@ def main():
             except:
                 dataPath = 'config'
 
-
     if out == '':
         out = os.getcwd()
     else:
@@ -854,7 +849,6 @@ def main():
         assemblyDir = dataPath + '/assembly_dir/'
     check_path(assemblyDir)
 
-
     try:
         f = open(out + "/fdog.log", "a+")
     except FileNotFoundError:
@@ -869,8 +863,15 @@ def main():
         sys.stdout = Logger(f)
 
     ########################### other variables ################################
-
-    assembly_names = os.listdir(assemblyDir)
+    if searchTaxa == []:
+        assembly_names = os.listdir(assemblyDir)
+    else:
+        assembly_names = os.listdir(assemblyDir)
+        for Taxon in searchTaxa:
+            if Taxon not in assembly_names:
+                print("Taxon %s is not in the assembly_dir" % Taxon)
+                sys.exit()
+        assembly_names = searchTaxa
 
     ################################# paths ####################################
 
@@ -924,170 +925,48 @@ def main():
     group_computation_time_end = time.time()
     time_group = group_computation_time_end - group_computation_time_start
 
-    searchBool = False
-
-    if searchTaxon == '':
-        ortholog_sequences = []
-        time_ortholog_start = time.time()
-        if parallel == True:
-            calls = []
-            cpus = mp.cpu_count()
-            pool = mp.Pool(cpus)
-            for asName in assembly_names:
-                calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs])
-
-            results = (pool.imap_unordered(ortholog_search, calls))
-            pool.close()
-            pool.join()
-            for i in results:
-                ortholog_sequences.append(i)
-        else:
-            for asName in assembly_names:
-                args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]
-                reciprocal_sequences, candidatesOutFile = ortholog_search(args)
-                ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
-
-        orthologsOutFile = out + "/" + group + ".extended.fa"
-        time_ortholog_end = time.time()
-        time_ortholog = time_ortholog_end - time_ortholog_start
-        if taxa == []:
-            taxa = [fdog_ref_species]
-        addRef(orthologsOutFile, fasta_path, taxa)
-        addSeq(orthologsOutFile, ortholog_sequences)
-        refBool = True
-        mappingFile = out + "/tmp/" + group + ".mapping.txt"
+    ###################### ortholog search #####################################
 
+    ortholog_sequences = []
+    time_ortholog_start = time.time()
+    if parallel == True:
+        ##################### parallel compuataion #############################
+        calls = []
+        cpus = mp.cpu_count()
+        pool = mp.Pool(cpus)
+        for asName in assembly_names:
+            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs])
 
+        results = (pool.imap_unordered(ortholog_search, calls))
+        pool.close()
+        pool.join()
+        for i in results:
+            ortholog_sequences.append(i)
     else:
-    #################### fDOG assembly computation for all species #############
+        ###################### computation species per species ################
         for asName in assembly_names:
-            if searchBool == True:
-                break
-            if searchTaxon != '' and searchBool == False:
-                asName = searchTaxon
-                searchBool = True
-
-            ################### path definitions ###################################
-
-            cmd = 'mkdir ' + out + '/tmp/' + asName
-            starting_subprocess(cmd, 'silent')
-            tmp_path = out + "tmp/" + asName + "/"
-            candidatesOutFile = tmp_path + group + ".candidates.fa"
-            if searchTaxon != '':
-                orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa"
-                fasOutFile = out + "/" + group + "_" + asName
-                mappingFile = tmp_path + group + "_" + asName + ".mapping.txt"
-            else:
-                orthologsOutFile = out + "/" + group + ".extended.fa"
-                fasOutFile = out + "/" + group
-                mappingFile = out + "/tmp/" + group + ".mapping.txt"
-
-            print("Searching in species " + asName + "\n")
-            assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
-            db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
-
-        ######################## tBLASTn ###########################################
-            #checks if data base exists already
-            db_check = searching_for_db(db_path)
-            if db_check == 0:
-                print("Creating a blast data base...")
-                cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
-                starting_subprocess(cmd, mode)
-                print("\t ...finished \n")
-
-            #makes a tBLASTn search against database
-            #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
-            print("Starting tBLASTn search...")
-            cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
-            exit_code = starting_subprocess(cmd, mode, 3600)
-            if exit_code == 1:
-                print("The tblastn search takes too long. Exciting ...")
-                f.close()
-                cleanup(tmp, tmp_folder)
-                sys.exit()
-            else:
-                print("\t ...finished")
-
-        ################### search for candidate regions and extract seq ###########
-        # parse blast and filter for candiate regions
-            regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
-
-            if regions == 0:
-                #no candidat region are available, no ortholog can be found
-                print("No candidate region found!\n")
-                if refBool == True:
-                    continue
-                else:
-                    taxa = [fdog_ref_species]
-                    reciprocal_sequences = 0
-            else:
-                print(str(number_regions) + " candiate regions were found.\n")
-                extract_seq(regions, db_path, tmp_path, mode)
-
-        ############### make Augustus PPX search ###################################
-
-                print("Starting augustus ppx ...")
-                augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-                print("\t ...finished \n")
-
-        ################# backward search to filter for orthologs###################
-                if int(os.path.getsize(candidatesOutFile)) <= 0:
-                    print("No genes found at candidate region\n")
-                    if searchTaxon == '' and refBool == True:
-                        continue
-                    else:
-                        reciprocal_sequences = 0
-                        taxa = [fdog_ref_species]
-                else:
-                    reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
-
-
-        ################## checking accepted genes for co-orthologs ################
-            if reciprocal_sequences == 0:
-                if regions != 0:
-                    print("No ortholog fulfilled the reciprocity criteria \n")
-                if searchTaxon == '' and refBool == True:
-                    continue
-                else:
-                    reciprocal_sequences = 0
-            else:
-                reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
-
-        ################ add sequences to extended.fa in the output folder##########
-
-            addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
-            refBool = True
+            args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]
+            reciprocal_sequences, candidatesOutFile = ortholog_search(args)
+            ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
+
+    ################## preparing output ########################################
+    orthologsOutFile = out + "/" + group + ".extended.fa"
+    time_ortholog_end = time.time()
+    time_ortholog = time_ortholog_end - time_ortholog_start
+    if taxa == []:
+        taxa = [fdog_ref_species]
+    if append == True:
+        addSeq(orthologsOutFile, ortholog_sequences)
+    else:
+        addRef(orthologsOutFile, fasta_path, taxa)
+        addSeq(orthologsOutFile, ortholog_sequences)
+    mappingFile = out + "/tmp/" + group + ".mapping.txt"
 
-        ############### make Annotation with FAS ###################################
-            # if we want to search in only one Taxon
-            if searchTaxon != '' and fasoff == False:
-                fas = time.time()
-                print("Calculating FAS scores ...")
-                fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
-                # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-                cmd = 'mkdir ' + tmp_path + 'anno_dir'
-                starting_subprocess(cmd, 'silent')
-                cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
-                starting_subprocess(cmd, 'silent')
-                clean_fas(fasOutFile + "_forward.domains", 'domains')
-                clean_fas(fasOutFile + "_reverse.domains", 'domains')
-                clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile')
-                print("\t ...finished \n")
-
-
-    #if we searched in more than one Taxon and no ortholog was found
-    if refBool == False and searchTaxon == '':
-        print("No orthologs found. Exciting ...")
-        f.close()
-        cleanup(tmp, tmp_folder)
-        return 1
-    #if we searched in more than one taxon
-    if fasoff == False and searchTaxon == '':
+    if fasoff == False:
         fas = time.time()
         print("Calculating FAS scores ...")
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
-        # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
         cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         starting_subprocess(cmd, 'silent')
         clean_fas(out + group + "_forward.domains", 'domains')
@@ -1100,17 +979,9 @@ def main():
     print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.")
     print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
     sys.stdout = sys.__stdout__
-    if searchTaxon != '':
-        f.close()
-        cleanup(tmp, tmp_folder)
-    else:
-        f.close()
-        cleanup(tmp, tmp_folder)
-
-
-    #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
-
 
+    f.close()
+    cleanup(tmp, tmp_folder)
 
 if __name__ == '__main__':
     main()
diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
deleted file mode 100644
index 1606b1d..0000000
--- a/fdog/mergeAssemblyOutput.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# -*- coding: utf-8 -*-
-
-#######################################################################
-# Copyright (C) 2020 Vinh Tran
-#
-#  This script is used to merge all output files (.extended.fa, .phyloprofile,
-#  _forward.domains, _reverse.domains) in a given directory into one file each.
-#
-#  This script is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License <http://www.gnu.org/licenses/> for
-#  more details
-#
-#  Contact: hannah.muelbaier@stud.uni-frankfurt.de
-#
-#######################################################################
-
-import sys
-import os
-from os import listdir as ldir
-import argparse
-from pathlib import Path
-
-def main():
-    version = '0.0.1'
-    parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.')
-    parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
-                        action='store', default='', required=True)
-    parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True)
-    parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False)
-    args = parser.parse_args()
-
-    directory = args.input
-    out = args.output
-    cleanup = args.cleanup
-    if not os.path.exists(os.path.abspath(directory)):
-        sys.exit('%s not found' % directory)
-    else:
-        directory = os.path.abspath(directory)
-
-    phyloprofile = None
-    set_phylo = set()
-    domains_0 = None
-    set_domains_f = set()
-    domains_1 = None
-    set_domains_r = set()
-    ex_fasta = None
-    set_fasta = set()
-    header_bool = False
-    for infile in ldir(directory):
-        if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile':
-            if not phyloprofile:
-                phyloprofile = open(out + '.phyloprofile', 'w')
-                phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                for line in lines:
-                    if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo:
-                        phyloprofile.write(line)
-                if len(lines) > 1:
-                    set_phylo = set(lines)
-            if cleanup == True:
-                os.remove(directory + '/' + infile)
-        elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains':
-            if not domains_0:
-                domains_0 = open(out + '_forward.domains', 'w')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                for line in lines:
-                    if line not in set_domains_f:
-                        domains_0.write(line)
-                if len(lines) > 1:
-                    set_domains_f = set(lines)
-            if cleanup == True:
-                os.remove(directory + '/' + infile)
-        elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains':
-            if not domains_1:
-                domains_1 = open(out + '_reverse.domains', 'w')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                for line in lines:
-                    if line not in set_domains_r:
-                        domains_1.write(line)
-                if len(lines) > 1:
-                    set_domains_r = set(lines)
-            if cleanup == True:
-                os.remove(directory + '/' + infile)
-        elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa':
-            if not ex_fasta:
-                ex_fasta = open(out + '.extended.fa', 'w')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                header = set()
-                #print(set_fasta)
-                for line in lines:
-                    if line[0] == ">":
-                        header.add(line)
-                        if line not in set_fasta:
-                            ex_fasta.write(line)
-                            header_bool = True
-                        else:
-                            header_bool = False
-                    else:
-                        if header_bool == True:
-                            ex_fasta.write(line)
-                set_fasta = header
-            if cleanup == True:
-                os.remove(directory + '/' +infile)
-        elif infile.endswith('.tsv'):
-            os.remove(directory + '/' + infile)
-
-    if phyloprofile:
-        phyloprofile.close()
-    if domains_0:
-        domains_0.close()
-    if domains_1:
-        domains_1.close()
-    if ex_fasta:
-        ex_fasta.close()
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/fdog/runMulti.py b/fdog/runMulti.py
index c19b598..c19b0ff 100644
--- a/fdog/runMulti.py
+++ b/fdog/runMulti.py
@@ -48,8 +48,7 @@ def prepare(args, step):
     coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
     fasoff, countercheck, coreFilter, minScore,
     strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-    cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args
-
+    cpu, hyperthread, checkOff, debug, silent) = args
 
     mute = False
     if step == 'core':
@@ -71,9 +70,7 @@ def prepare(args, step):
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
     orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
     otherArgs = [cpu, hyperthread, checkOff, debug, True]
-    assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
-    return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute)
-
+    return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute)
 
 def getSeedName(seedFile):
     seqName = seedFile.split('.')[0]
@@ -108,10 +105,9 @@ def compileCore(options, seeds, inFol, cpu, outpath):
     for seed in seeds:
         seqFile = [inFol + '/' + seed]
         seqName = getSeedName(seed)
-
         if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)):
             (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core')
-            coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute])
+            coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute])
     if len(coreCompilationJobs) > 0:
         pool = mp.Pool(cpu)
         coreOut = []
@@ -133,7 +129,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath):
     for seed in seeds:
         seqFile = [inFol + '/' + seed]
         seqName = getSeedName(seed)
-        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
+        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
         if mute == True:
             print(seed)
         else:
@@ -295,14 +291,6 @@ def main():
     optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
     optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
 
-    assembly_options = parser.add_argument_group('Assembly options')
-    assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
-    assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
-    assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
-    assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
-    assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
-    assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
-    assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
     ### get arguments
     args = parser.parse_args()
 
@@ -379,15 +367,6 @@ def main():
     else:
         silent = True
 
-    #fdog_goes_assembly arguments
-    assembly = args.assembly
-    assemblyFile = args.assemblyFile
-    augustusRefSpec = args.augustusRefSpec
-    avIntron = args.avIntron
-    lengthExtension = args.lengthExtension
-    searchTool = args.searchTool
-    matrix = args.scoringmatrix
-
     ### check fas
     if not fasoff:
         try:
@@ -472,7 +451,7 @@ def main():
                 coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
                 fasoff, countercheck, coreFilter, minScore,
                 strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-                cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
+                cpu, hyperthread, checkOff, debug, silent]
 
     ### START
     Path(outpath).mkdir(parents=True, exist_ok=True)
diff --git a/fdog/runSingle.py b/fdog/runSingle.py
index c65300f..c4abb82 100644
--- a/fdog/runSingle.py
+++ b/fdog/runSingle.py
@@ -65,13 +65,13 @@ def getfdogInfo(fdogPath, infoType):
         exit('%s not found' % (fdogPath + '/bin/oneSeq.pl'))
 
 def runSingle(args):
-    (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args
+    (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args
     # basic command
     (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs
     cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec)
     # add paths
-    (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs
-    cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath)
+    (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs
+    cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath)
     # add other I/O options
     (append, force, noCleanup, group, blast, db) = ioArgs
     if append == True:
@@ -163,28 +163,7 @@ def runSingle(args):
         cmd = cmd + ' -debug'
     if silent == True:
         cmd = cmd + ' -silent'
-    # add assembly options
-    (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs
-    if assembly == True:
-        cmd = cmd + ' -assembly'
-        cmd = cmd + ' -reuseCore'
-        if not augustusRefSpec == '':
-            cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec
-        else:
-            sys.exit('An augutus reference species is requiered by using the option --assembly')
-        if not avIntron == '':
-            cmd = cmd + ' -avIntron=%s' % avIntron
-        if not lengthExtension == '':
-            cmd = cmd + ' -lengthExtension=%s' % lengthExtension
-        if not assemblyFile == '':
-            cmd = cmd + ' -assemblyFile=%s' % assemblyFile
-        if not searchTool == '':
-            cmd = cmd + ' -searchTool=%s' % searchTool
-        if not matrix == '':
-            cmd = cmd + ' -scoringmatrix=%s' % matrix
-        if not dataPath == '':
-            cmd = cmd + ' -dataPath=%s' % dataPath
-    #print(cmd)
+    # print(cmd)
     if mute == True:
         cmd = cmd + ' > /dev/null 2>&1'
     try:
@@ -211,8 +190,6 @@ def main():
     optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='')
     optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='')
     optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
-    optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='')
-
 
     addtionalIO = parser.add_argument_group('Other I/O options')
     addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
@@ -295,14 +272,6 @@ def main():
     optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
     optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
 
-    assembly_options = parser.add_argument_group('Assembly options')
-    assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
-    assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
-    assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
-    assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
-    assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
-    assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
-    assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
     ### get arguments
     args = parser.parse_args()
 
@@ -322,7 +291,6 @@ def main():
     searchpath = args.searchpath
     weightpath = args.weightpath
     pathFile = args.pathFile
-    assemblypath = args.assemblypath
 
     # other I/O arguments
     append = args.append
@@ -378,15 +346,6 @@ def main():
     else:
         silent = True
 
-    #fdog_goes_assembly arguments
-    assembly = args.assembly
-    assemblyFile = args.assemblyFile
-    augustusRefSpec = args.augustusRefSpec
-    avIntron = args.avIntron
-    lengthExtension = args.lengthExtension
-    searchTool = args.searchTool
-    matrix = args.scoringmatrix
-
     ### get fdog and data path
     dataPath = ''
     fdogPath = os.path.realpath(__file__).replace('/runSingle.py','')
@@ -434,30 +393,19 @@ def main():
             except:
                 sys.exit('weightpath not found in %s' % pathFile)
 
-    if assemblypath == '':
-        assemblypath = dataPath + '/assembly_dir'
-        if dataPath == 'config':
-            try:
-                assemblypath = cfg['assemblypath']
-            except:
-                sys.exit('assemblypath not found in %s' % pathFile)
-    if assembly == True:
-        searchpath = assemblypath
-
     ### check input arguments
     seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath])
     # group arguments
     basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth]
     ioArgs = [append, force, noCleanup, group, blast, db]
-    pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath]
+    pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath]
     coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
     orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
     otherArgs = [cpu, hyperthread, checkOff, debug, silent]
-    assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath]
 
     ### run fdog
-    runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False])
+    runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False])
 
 if __name__ == '__main__':
     main()

From 49a430b913970276efa3c69af3ca9007f7b5e3c9 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 11 Oct 2021 15:56:40 +0200
Subject: [PATCH 126/192] testing addSeq function

---
 fdog/fDOGassembly.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index eb9dc41..3940f04 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -484,6 +484,8 @@ def addSeq(output, seq_list):
         #print(item)
         candidate_fasta = item[1]
         sequenceIds = item[0]
+        print(sequenceIds)
+        print(type(sequenceIds))
         if sequenceIds == 0 or sequenceIds == []:
             pass
         seq_records_candidate = readFasta(candidate_fasta)

From e18872b31a0c5b013a2c7bdece18674c3b8c5974 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 11 Oct 2021 16:02:13 +0200
Subject: [PATCH 127/192] bug fix in addSeq function

---
 fdog/fDOGassembly.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 3940f04..71beafc 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -484,10 +484,8 @@ def addSeq(output, seq_list):
         #print(item)
         candidate_fasta = item[1]
         sequenceIds = item[0]
-        print(sequenceIds)
-        print(type(sequenceIds))
         if sequenceIds == 0 or sequenceIds == []:
-            pass
+            continue
         seq_records_candidate = readFasta(candidate_fasta)
         seq_records_candidate = list(seq_records_candidate)
         for entry_candidate in seq_records_candidate:

From b4d1e0c3f8fb09ca0214c45ff40789e3b56d64b1 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sat, 16 Oct 2021 14:40:39 +0200
Subject: [PATCH 128/192] bug fix in ortholog search function

---
 fdog/fDOGassembly.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 71beafc..7c45233 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -650,8 +650,7 @@ def ortholog_search(args):
     time_tblastn = time_tblastn_end - time_tblastn_start
     if exit_code == 1:
         print("The tblastn search takes too long for species %s. Exciting ..." % asName)
-        f.close()
-        cleanup(tmp, tmp_folder)
+        #cleanup(tmp, tmp_folder)
         sys.exit()
     #else:
         #print("\t ...finished")

From e85fd1c561df192ebdcd15ddd0c84336baad327f Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Sun, 17 Oct 2021 12:24:31 +0200
Subject: [PATCH 129/192] bug fix in ortholog search if tblastn takes too long

---
 fdog/fDOGassembly.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 7c45233..0a9df8f 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -244,7 +244,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
                         output.write(line)
                 sequence_file.close()
             except FileNotFoundError:
-                print("No gene found in region with ID:" + name + " , continuing with next region")
+                print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
     output.close()
 
 def searching_for_db(assembly_path):
@@ -315,7 +315,12 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
         #print("mafft-linsi")
         os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
 
-    distances = get_distance_biopython(aln_file, matrix)
+    try:
+        distances = get_distance_biopython(aln_file, matrix)
+    except ValueError:
+        print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
+        return 0, "NaN", "NaN"
+
 
     distance_hit_query = distances[best_hit, candidate_name]
     distance_ref_hit = distances[best_hit, ref]
@@ -374,7 +379,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
                                 print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                                 orthologs.append(gene)
                             elif co_orthologs_result == 0:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
+                                if distance_ref_hit != "NaN":
+                                    print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                     else:
                         print("\tnothitting\n") if mode == "debug" else ""
             elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs:
@@ -629,7 +635,7 @@ def ortholog_search(args):
     fasOutFile = out + "/" + group
     #mappingFile = out + "/tmp/" + group + ".mapping.txt"
 
-    print("Searching in species " + asName + "\n")
+    sys.stdout.write("Searching in species " + asName + "\n")
     assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
     db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
     db_check = searching_for_db(db_path)
@@ -649,9 +655,10 @@ def ortholog_search(args):
     time_tblastn_end = time.time()
     time_tblastn = time_tblastn_end - time_tblastn_start
     if exit_code == 1:
-        print("The tblastn search takes too long for species %s. Exciting ..." % asName)
+        sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName)
         #cleanup(tmp, tmp_folder)
-        sys.exit()
+        #sys.exit()
+        return [], candidatesOutFile
     #else:
         #print("\t ...finished")
     print("Time tblastn %s in species %s" % (str(time_tblastn), asName))
@@ -659,7 +666,7 @@ def ortholog_search(args):
     regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
     if regions == 0:
         #no candidat region are available, no ortholog can be found
-        print("No candidate region found for species %s!\n" % asName)
+        sys.stdout.write("No candidate region found for species %s!\n" % asName)
         return [], candidatesOutFile
 
     else:
@@ -684,7 +691,7 @@ def ortholog_search(args):
 
     if reciprocal_sequences == 0:
         if regions != 0:
-            print("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
+            sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
         return [], candidatesOutFile
     else:
         reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
@@ -976,7 +983,7 @@ def main():
     end = time.time()
     time_fas = end - fas
     print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.")
-    print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
+    print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
     sys.stdout = sys.__stdout__
 
     f.close()

From 1f9f736325253c08f33d60bc787136c10f6ef303 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 18 Oct 2021 10:23:16 +0200
Subject: [PATCH 130/192] updated input options

---
 fdog/fDOGassembly.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 0a9df8f..e40701b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -727,7 +727,7 @@ def main():
     required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/',
                             action='store', default='', required=True)
     required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True)
-    required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True)
+    required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True)
     ################## optional arguments ######################################
     optional = parser.add_argument_group('Optional arguments')
     optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int)
@@ -744,10 +744,10 @@ def main():
     optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False)
     optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
     optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[])
-    optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
+    #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
     optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
     optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
-    optional.add_argument('--searchTaxa', help='Search Taxon name', action='store', nargs="+", default=[])
+    optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[])
     optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
     optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
     optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)

From 42e4ba122504f24a25f748c087d14aa0d199a419 Mon Sep 17 00:00:00 2001
From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com>
Date: Tue, 19 Oct 2021 10:51:15 +0200
Subject: [PATCH 131/192] Update fDOG goes assembly to version 0.1.2 (#12)

---
 fdog/addTaxa.py             |  15 +-
 fdog/addTaxon.py            |  62 ++--
 fdog/bin/hamstr.pl          | 141 ++++-----
 fdog/bin/oneSeq.pl          | 204 +++++--------
 fdog/checkData.py           |  69 +++--
 fdog/fDOGassembly.py        | 563 ++++++++++++++++++++++--------------
 fdog/mergeAssemblyOutput.py | 124 --------
 fdog/mergeOutput.py         |   7 +-
 fdog/removefDog.py          |   4 +-
 fdog/runMulti.py            |  46 +--
 fdog/runSingle.py           |  74 +----
 fdog/setup/install_lib.sh   |   3 -
 fdog/setup/setup.sh         |  43 +--
 fdog/setup/setup_conda.sh   |  25 +-
 setup.py                    |   4 +-
 15 files changed, 648 insertions(+), 736 deletions(-)
 delete mode 100644 fdog/mergeAssemblyOutput.py

diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py
index d392c8c..fa4a3a1 100644
--- a/fdog/addTaxa.py
+++ b/fdog/addTaxa.py
@@ -37,6 +37,7 @@
 import re
 import shutil
 from tqdm import tqdm
+from datetime import datetime
 
 def checkFileExist(file):
     if not os.path.exists(os.path.abspath(file)):
@@ -68,20 +69,18 @@ def parseMapFile(mappingFile):
             try:
                 ver = tmp[3].strip()
             except:
-                ver = 1
+                ver = datetime.today().strftime('%y%m%d') #1
             # print(taxName+"@"+str(taxId)+"@"+str(ver))
             nameDict[fileName] = (taxName, str(taxId), str(ver))
     return(nameDict)
 
 def runAddTaxon(args):
-    (f,n,i,o,c,v,a,cpus,replace,delete,oldFAS) = args
+    (f,n,i,o,c,v,a,cpus,replace,delete) = args
     cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus)
     if c == True:
         cmd = cmd + ' -c'
     if a == True:
         cmd = cmd + ' -a'
-    if oldFAS == True:
-        cmd = cmd + ' --oldFAS'
     if replace == True:
         cmd = cmd + ' --replace'
     if delete == True:
@@ -95,7 +94,7 @@ def runAddTaxon(args):
         sys.exit('Problem running\n%s' % (cmd))
 
 def main():
-    version = '0.0.5'
+    version = '0.0.9'
     parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
@@ -105,8 +104,7 @@ def main():
                             action='store', default='', required=True)
     optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
     optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
-    optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using annoFAS', action='store_true', default=False)
-    optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False)
+    optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False)
     optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
     optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
     optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
@@ -128,7 +126,6 @@ def main():
     outPath = os.path.abspath(outPath)
     noAnno = args.noAnno
     coreTaxa = args.coreTaxa
-    oldFAS = args.oldFAS
     cpus = args.cpus
     if cpus == 0:
         cpus = mp.cpu_count()-2
@@ -171,7 +168,7 @@ def main():
                 verProt = nameDict[f][2]
                 jobs.append([
                     folIn + '/' + f, nameDict[f][0], nameDict[f][1],
-                    outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete, oldFAS
+                    outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete
                 ])
 
     if len(dupList) > 0:
diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py
index fe0a810..f962cba 100755
--- a/fdog/addTaxon.py
+++ b/fdog/addTaxon.py
@@ -32,6 +32,7 @@
 import multiprocessing as mp
 from ete3 import NCBITaxa
 import re
+import shutil
 from datetime import datetime
 
 def checkFileExist(file):
@@ -83,7 +84,7 @@ def runBlast(args):
         os.symlink(fileInGenome, fileInBlast)
 
 def main():
-    version = '0.0.5'
+    version = '0.0.10'
     parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
@@ -91,10 +92,9 @@ def main():
     required.add_argument('-i', '--taxid', help='Taxonomy ID of input taxon', action='store', default='', required=True, type=int)
     optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
     optional.add_argument('-n', '--name', help='Acronym name of input taxon', action='store', default='', type=str)
-    optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default=1, type=str)
+    optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default='', type=str)
     optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
-    optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using annoFAS', action='store_true', default=False)
-    optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False)
+    optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False)
     optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
     optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
     optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
@@ -119,7 +119,8 @@ def main():
     noAnno = args.noAnno
     coreTaxa = args.coreTaxa
     ver = str(args.verProt)
-    oldFAS = args.oldFAS
+    if ver == '':
+        ver = datetime.today().strftime('%y%m%d')
     cpus = args.cpus
     if cpus == 0:
         cpus = mp.cpu_count()-2
@@ -135,6 +136,13 @@ def main():
     specName = name+'@'+taxId+'@'+ver
     print('Species name\t%s' % specName)
 
+    ### remove old folder if force is set
+    if force:
+        if os.path.exists(outPath + '/genome_dir/' + specName):
+            shutil.rmtree(outPath + '/genome_dir/' + specName)
+        if os.path.exists(outPath + '/blast_dir/' + specName):
+            shutil.rmtree(outPath + '/blast_dir/' + specName)
+
     ### create file in genome_dir
     print('Parsing FASTA file...')
     Path(outPath + '/genome_dir').mkdir(parents = True, exist_ok = True)
@@ -147,25 +155,30 @@ def main():
         f = open(specFile, 'w')
         index = 0
         modIdIndex = 0
-        longId = 'no'
+        # longId = 'no'
         tmpDict = {}
+        # with open(specFile + '.mapping', 'a') as mappingFile:
         for id in inSeq:
             seq = str(inSeq[id].seq)
             # check ID
-            id = re.sub('\|', '_', id)
-            oriId = id
-            if len(id) > 30:
-                modIdIndex = modIdIndex + 1
-                id = specName + "_" + str(modIdIndex)
-                longId = 'yes'
-                with open(specFile + '.mapping', 'a') as mappingFile:
-                    mappingFile.write('%s\t%s\n' % (id, oriId))
-            if not id in tmpDict:
-                tmpDict[id] = 1
+            # oriId = id
+            if ' ' in id:
+                sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id)
             else:
-                index = index + 1
-                id = str(id) + '_' + str(index)
-                tmpDict[id] = 1
+                if '|' in id:  # plain substring test; '\|' is backslash+pipe in Python and never matched a bare pipe
+                    print('\033[91mWARNING: Sequence IDs contain pipe(s). They will be replaced by "_"!\033[0m')
+                    id = id.replace('|', '_')  # literal replacement, no regex escaping needed
+            # if len(id) > 20:
+            #     modIdIndex = modIdIndex + 1
+            #     id = modIdIndex
+            #     longId = 'yes'
+            # if not id in tmpDict:
+            #     tmpDict[id] = 1
+            # else:
+            #     index = index + 1
+            #     id = str(index)
+            #     tmpDict[id] = 1
+            # mappingFile.write('%s\t%s\n' % (id, oriId))
             # check seq
             if seq[-1] == '*':
                 seq = seq[:-1]
@@ -187,8 +200,8 @@ def main():
         cf.write(str(datetime.now()))
         cf.close()
         # warning about long header
-        if longId == 'yes':
-            print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile)
+        # if longId == 'yes':
+        #     print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile)
     else:
         print(genomePath + '/' + specName + '.fa already exists!')
 
@@ -207,16 +220,13 @@ def main():
     ### create annotation
     if not noAnno:
         Path(outPath + '/weight_dir').mkdir(parents = True, exist_ok = True)
-        annoCmd = 'annoFAS -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus)
+        annoCmd = 'fas.doAnno -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus)
         if force:
             annoCmd = annoCmd + " --force"
-        if oldFAS:
-            print("running old version of FAS...")
-            annoCmd = 'annoFAS -i %s/%s.fa -o %s -n %s --cores %s' % (genomePath, specName, outPath+'/weight_dir', specName, cpus)
         try:
             subprocess.call([annoCmd], shell = True)
         except:
-            print('\033[91mProblem with running annoFAS. You can check it with this command:\n%s\033[0m' % annoCmd)
+            print('\033[91mProblem with running fas.doAnno. You can check it with this command:\n%s\033[0m' % annoCmd)
 
     print('Output for %s can be found in %s within genome_dir [and blast_dir, weight_dir] folder[s]' % (specName, outPath))
 
diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl
index 7ff125e..3feb01e 100755
--- a/fdog/bin/hamstr.pl
+++ b/fdog/bin/hamstr.pl
@@ -195,9 +195,10 @@
 ## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef
 ## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name
 ## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory
+## 19.03.2021 (v13.4.5 - vinh) do not replace space by @ for hmm output in parseHmmer4pm
 
 ######################## start main ###########################################
-my $version = "HaMStR v.13.4.4";
+my $version = "HaMStR v.13.4.5";
 ######################## checking whether the configure script has been run ###
 my $configure = 0;
 if ($configure == 0){
@@ -315,7 +316,7 @@
 my $ublast = 0;
 my $accel = 0.8;
 #####determine the hostname#######
-push @log, "VERSION:\t$version\n";
+# push @log, "VERSION:\t$version\n";
 my $hostname = `hostname`;
 chomp $hostname;
 push @log, "HOSTNAME\t$hostname\n";
@@ -520,7 +521,7 @@
 	exit;
 }
 else {
-	open (OUT, ">$outpath/hamstrsearch.log") or die "could not open logfile\n";
+	open (OUT, ">$outpath/fdog.log") or die "could not open logfile\n";
 	print OUT join "\n", @log;
 	close OUT;
 }
@@ -1059,7 +1060,7 @@ sub checkInput {
 					}
 				}
 			} else {
-				push @log, "\trunning HaMStR with all hmms in $hmm_dir";
+				push @log, "\trunning fDOG with all hmms in $hmm_dir";
 				my $hmm_dir_tmp = $hmm_dir; $hmm_dir_tmp =~ s/\|/\\\|/g;
 				@hmms = `ls $hmm_dir_tmp`;
 			}
@@ -1299,10 +1300,10 @@ sub checkInput {
 	}
 	## 14) determin whether or not the -representative flag has been set
 	if (defined $rep) {
-		push @log, "\tHaMStR will run with the -representative option";
+		push @log, "\tfDOG will run with the -representative option";
 	}
 	else {
-		push @log, "\tHaMStR was called without the -representative option. More than one ortholog may be identified per core-ortholog group!";
+		push @log, "\tfDOG was called without the -representative option. More than one ortholog may be identified per core-ortholog group!";
 	}
 
 	## check further options
@@ -1854,68 +1855,68 @@ sub revComp {
 	return($seq);
 }
 ##############################
-sub parseHmmer3pm {
-	my ($file, $path) = @_;
-	my $hits;
-	my $query;
-	my %tmphash;
-	if (!defined $path){
-		$path = '.';
-	}
-	$file = $path . '/' . $file;
-	my $in = Bio::SearchIO->new(
-	-format => 'hmmer',
-	-file   => $file
-	);
-	while( my $result = $in->next_result ) {
-		# this is a Bio::Search::Result::HMMERResult object
-		if (!defined $query){
-			$query = $result->query_name();
-			printOUT("query is $query\n");
-		}
-		my $hitcount = 0;
-		while( my $hit = $result->next_hit ) {
-			my $tmp = $hit->name();
-			my $tmpscore = $hit->score();
-			$tmp =~ s/_RF.*//;
-			if (!defined $tmphash{$tmp}){
-				$hits->[$hitcount]->{id} = $tmp;
-				$hits->[$hitcount]->{hmmscore} = $tmpscore;
-				$hitcount++;
-				$tmphash{$tmp}=1;
-				if (defined $bhh){
-					last;
-				}
-			}
-		}
-
-		if (defined $hits->[0]) {
-			####### a quick hack to obtain the lagPhase value
-			my $criticalValue; # takes the value used for candidate discrimination
-			my $hitLimitLoc = $hitlimit;
-			if (defined $autoLimit) {
-				printDebug("Entering getLag Routine\n");
-				## the user has invoked the autmated inference of a hit limit
-				($hitLimitLoc, $criticalValue)  = getLag($hits, $hitcount);
-				if (!defined $criticalValue) {
-					## there was a problem in the computatation of the lagPhase
-					print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n";
-					($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
-				}
-			}
-			elsif (defined $scoreThreshold) {
-				printDebug("entering the scoreThreshold routine");
-				($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
-				printDebug("hitlimitloc is now $hitLimitLoc");
-			}
-
-			return ($query, $hits, $hitLimitLoc, $criticalValue);
-		}
-		else {
-			return ($query);
-		}
-	}
-}
+# sub parseHmmer3pm {
+# 	my ($file, $path) = @_;
+# 	my $hits;
+# 	my $query;
+# 	my %tmphash;
+# 	if (!defined $path){
+# 		$path = '.';
+# 	}
+# 	$file = $path . '/' . $file;
+# 	my $in = Bio::SearchIO->new(
+# 	-format => 'hmmer',
+# 	-file   => $file
+# 	);
+# 	while( my $result = $in->next_result ) {
+# 		# this is a Bio::Search::Result::HMMERResult object
+# 		if (!defined $query){
+# 			$query = $result->query_name();
+# 			printOUT("query is $query\n");
+# 		}
+# 		my $hitcount = 0;
+# 		while( my $hit = $result->next_hit ) {
+# 			my $tmp = $hit->name();
+# 			my $tmpscore = $hit->score();
+# 			$tmp =~ s/_RF.*//;
+# 			if (!defined $tmphash{$tmp}){
+# 				$hits->[$hitcount]->{id} = $tmp;
+# 				$hits->[$hitcount]->{hmmscore} = $tmpscore;
+# 				$hitcount++;
+# 				$tmphash{$tmp}=1;
+# 				if (defined $bhh){
+# 					last;
+# 				}
+# 			}
+# 		}
+#
+# 		if (defined $hits->[0]) {
+# 			####### a quick hack to obtain the lagPhase value
+# 			my $criticalValue; # takes the value used for candidate discrimination
+# 			my $hitLimitLoc = $hitlimit;
+# 			if (defined $autoLimit) {
+# 				printDebug("Entering getLag Routine\n");
+# 				## the user has invoked the automated inference of a hit limit
+# 				($hitLimitLoc, $criticalValue)  = getLag($hits, $hitcount);
+# 				if (!defined $criticalValue) {
+# 					## there was a problem in the computation of the lagPhase
+# 					print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n";
+# 					($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
+# 				}
+# 			}
+# 			elsif (defined $scoreThreshold) {
+# 				printDebug("entering the scoreThreshold routine");
+# 				($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
+# 				printDebug("hitlimitloc is now $hitLimitLoc");
+# 			}
+#
+# 			return ($query, $hits, $hitLimitLoc, $criticalValue);
+# 		}
+# 		else {
+# 			return ($query);
+# 		}
+# 	}
+# }
 ##############################
 sub parseHmmer4pm {
 	my ($file, $path) = @_;
@@ -1931,9 +1932,9 @@ sub parseHmmer4pm {
 	$file = $path . '/' . $file;
 
 	$file =~ s/\|/\\\|/g;
-	my @hmmout = `$grepprog -v '#' $file |sort -rnk 9 |sed -e 's/ /@/g'`;
+	my @hmmout = `$grepprog -v '#' $file |sort -rnk 9`;
 	for (my $i = 0; $i < @hmmout; $i++) {
-		($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession},  $hmmhits->[$i]->{total_evalue},  $hmmhits->[$i]->{total_score},  $hmmhits->[$i]->{total_bias},  $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score},  $hmmhits->[$i]->{domain_bias}, @rest) = split(/@+/, $hmmout[$i]);
+		($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession},  $hmmhits->[$i]->{total_evalue},  $hmmhits->[$i]->{total_score},  $hmmhits->[$i]->{total_bias},  $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score},  $hmmhits->[$i]->{domain_bias}, @rest) = split(/\s+/, $hmmout[$i]);
 
 		if (!defined $query){
 			$query = $hmmhits->[$i]->{query_name};
diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl
index 7e8a248..a99e1e6 100755
--- a/fdog/bin/oneSeq.pl
+++ b/fdog/bin/oneSeq.pl
@@ -127,13 +127,17 @@
 ## Modified 24. March 2021 v2.2.8 (Vinh)	- skip fa.mapping while checking genome_dir
 ## Modified 29. March 2021 v2.2.9 (Vinh)	- check for zero $maxAlnScore
 ##                                        - solved problem with long input path for fasta36 tools
+## Modified 23. April 2021 v2.3.0 (Vinh)	- parse fasta36 output for long IDs (longer than 60 chars)
+## Modified 31. May 2021 v2.3.1 (Vinh)	- added auto annotation for fdogFas
+## Modified 11. June 2021 v2.3.2 (Vinh)	- fixed --append option
+## Modified 16. June 2021 v2.4.0 (Vinh)	- add checkOff option
 
 ############ General settings
-my $version = 'oneSeq v.2.2.9';
+my $version = 'oneSeq v.2.4.0';
 ##### configure for checking if the setup.sh script already run
 my $configure = 0;
 if ($configure == 0){
-	die "\n\n$version\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n";
+	die "\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n";
 }
 ##### hostname
 my $hostname = `hostname`;
@@ -173,9 +177,9 @@
 my $outputfmt = 'blastxml';
 my $eval_blast_query = 0.0001;
 my $filter = 'F'; # default for blastp
-my $annotation_prog = "annoFAS";
-my $fas_prog = "calcFAS";
-my $fdogFAS_prog = "fdogFAS";
+my $annotation_prog = "fas.doAnno";
+my $fas_prog = "fas.run";
+my $fdogFAS_prog = "fas.runFdogFas";
 
 ##### ublast Baustelle: not implemented yet
 my $runublast = 0;
@@ -203,7 +207,6 @@
 my $idx_dir = "$path/taxonomy/";
 my $dataDir = $path . '/data';
 my $weightPath = "$path/weight_dir/";
-my $assembly_dir = "$path/assembly_dir/";
 
 my @defaultRanks = (
 	'superkingdom', 'kingdom',
@@ -260,6 +263,7 @@
 my $blastNode;
 my $representative;
 my $core_rep;
+my $checkOff;
 my $debug;
 my $corestrict;
 my $inputSeq = "";
@@ -307,15 +311,6 @@
 my %hashTree;
 my $aln = 'muscle';
 my $searchTaxa;
-#variables for fdog_goes_assembly
-my $assembly;
-my $augustusRefSpec;
-my $avIntron;
-my $lengthExtension;
-my $assemblyPath;
-my $searchTool = 'blast';
-my $matrix = 'blosum62';
-my $dataPath = '';
 ################# Command line options
 GetOptions (
 	"h"                 => \$help,
@@ -365,6 +360,7 @@
 	"blastpath=s"         => \$blastPath,
 	"searchpath=s"         => \$genome_dir,
 	"weightpath=s"         => \$weightPath,
+	"checkOff"             => \$checkOff,
 	"debug"             => \$debug,
 	"coreHitlimit=s"   => \$core_hitlimit,
 	"hitlimit=s"        => \$hitlimit,
@@ -377,15 +373,7 @@
 	"distDeviation=s"	=> \$distDeviation,
 	"aligner=s"	=> \$aln,
 	"hyperthread" => \$hyperthread,
-	"searchTaxa=s" => \$searchTaxa,
-	"assembly" => \$assembly,
-	"assemblypath=s" => \$assemblyPath,
-	"augustusRefSpec=s" => \$augustusRefSpec,
-	"avIntron=s" => \$avIntron,
-	"lengthExtension=s" => \$lengthExtension,
-	"searchTool=s" => \$searchTool,
-	"scoringmatrix=s" => \$matrix,
-	"dataPath=s" => \$dataPath
+	"searchTaxa=s" => \$searchTaxa
 );
 
 $outputPath = abs_path($outputPath);
@@ -397,17 +385,16 @@
 $weightPath = abs_path($weightPath)."/";
 $genome_dir = abs_path($genome_dir)."/";
 $taxaPath = $genome_dir;
-$dataPath = abs_path($dataPath)."/";
-$assembly_dir = abs_path($assemblyPath)."/";
 
 ############# do initial check
 if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) {
 	print "Validity checking....\n";
 	my $checkStTime = gettime();
-	initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff);
-	print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n";
+	unless($checkOff) {
+		initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff);
+	}
 
-	if (!defined $coreex && !defined $assembly) {
+	if (!defined $coreex) {
 		if (!grep(/$minDist/, @defaultRanks)) {
 			die "ERROR: minDist $minDist invalid!\n";
 		}
@@ -420,6 +407,7 @@
 			die "ERROR: coreOrth not defined (must be integer)!";
 		}
 	}
+	print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n";
 }
 
 ############# show version
@@ -490,7 +478,7 @@
 
 # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction)
 # get annotations for seed sequence if fas support is on
-if ($fas_support && !$assembly){
+if ($fas_support){
 	if (!$weightPath) {
 		createWeightFolder();
 	}
@@ -499,7 +487,7 @@
 
 my $coreStTime = gettime(); #time;
 #core-ortholog search
-if (!$coreex && !$assembly) {
+if (!$coreex) {
 	print "\nCore compiling...\n";
 	$coremode = 1;
 	$taxaPath = $blastPath;
@@ -637,12 +625,7 @@
 	my $final_eval_blast = $eval_blast*$eval_relaxfac;
 	my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac;
 
-	if (!$assembly){
-		$taxaPath = $genome_dir;
-	}
-	else{
-		$taxaPath = $assembly_dir;
-	}
+	$taxaPath = $genome_dir;
 	my @searchTaxa;
 	unless ($searchTaxa) {
 		unless($groupNode) {
@@ -698,72 +681,20 @@
 				}
 			}
 		}
-		if ($assembly){
-			$eval_blast = sprintf("%f", $eval_blast);
-			if ($seqFile ne "") {
-				my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent");
-
-				if (defined $assemblyPath){
-					push(@assembly_cmd, "--assemblyPath $assemblyPath")
-				}
-				if (defined $avIntron){
-					push(@assembly_cmd, "--avIntron $avIntron ");
-				}
-				if (defined $lengthExtension){
-					push(@assembly_cmd, "--lengthExtension $lengthExtension ");
-				}
-				if (!$autoclean){
-					push(@assembly_cmd, "--tmp ");
-				}
-				if ($outputPath){
-					push(@assembly_cmd, "--out $outputPath ");
-				}
-				if (defined $strict){
-					push(@assembly_cmd, "--strict");
-				}
-				if ($eval_blast){
-					push(@assembly_cmd, "--evalBlast $eval_blast ");
-				}
-				if ($searchTool){
-					push(@assembly_cmd, "--msaTool $aln ");
-				}
-				if (defined $checkcoorthologsref){
-					push(@assembly_cmd, "--checkCoorthologsRef");
-				}
-				if ($searchTool){
-					push(@assembly_cmd, "--searchTool $searchTool");
-				}
-				if ($matrix){
-					push(@assembly_cmd, "--scoringmatrix $matrix");
-				}
-				if ($coreOrthologsPath){
-					push(@assembly_cmd, "--coregroupPath $coreOrthologsPath");
-				}
-				if ($fasoff){
-					push(@assembly_cmd, "--fasoff");
-				}
-				if ($searchTaxon){
-					push(@assembly_cmd, "--searchTaxon $searchTaxon");
-				}
-				if ($filter){
-					push(@assembly_cmd, "--filter $filter");
-				}
-				printDebug(@assembly_cmd);
-				system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n";
-			}
-		}
-		else{
 		runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln);
-		}
 		$pm->finish;
 	}
 	$pm->wait_all_children;
 }
+### remove duplicated seq in extended.fa
+if (-e $finalOutput) {
+	addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
+}
 push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!";
 print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n";
 
-
-if(!$coreOnly && !$assembly){
+## Evaluation of all orthologs that are predicted by the final run
+if(!$coreOnly){
 	my $fasStTime = gettime();
 	my $processID = $$;
 
@@ -775,9 +706,9 @@
 	addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
 
 	# calculate FAS scores for final extended.fa
-	if ($fas_support && !$assembly) {
+	if ($fas_support) {
 		print "Starting the feature architecture similarity score computation...\n";
-		my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu";
+		my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno";
 		unless ($countercheck) {
 			$fdogFAScmd .= " --bidirectional"
 		}
@@ -788,21 +719,12 @@
 	}
 	push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
 	print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
-
 	if($autoclean){
 		print "Cleaning up...\n";
 		runAutoCleanUp($processID);
 	}
 }
 
-if ($assembly){
-	my $file_assembly_out;
-	$file_assembly_out = $outputPath . '/' . $seqName;
-	my $cmd_merge;
-	$cmd_merge = "fdog.mergeAssembly --in  $outputPath --out  $file_assembly_out --cleanup";
-	printDebug($cmd_merge);
-	system($cmd_merge);
-}
 ## Delete tmp folder
 unless ($debug) {
 	my $delTmp = "rm -rf $tmpdir";
@@ -814,7 +736,10 @@
 push @logOUT, "fdog finished after " . roundtime(gettime() - $startTime) . " sec!\n";
 
 #### writing the log
-open (LOGOUT, ">$outputPath/fdog.log") or warn "Failed to open fdog.log for writing";
+open (LOGOUT, ">>$outputPath/fdog.log") or die "Could not open $outputPath/fdog.log for writing\n";
+print LOGOUT "\n\n";
+my $fdogVersion = `fdog.run --version`;
+print LOGOUT "fDOG v$fdogVersion\n";
 print LOGOUT join "\n", @logOUT;
 close LOGOUT;
 exit;
@@ -1209,10 +1134,10 @@ sub checkOptions {
 	if ($force == 1 and $append ==1) {
 		$force = 0;
 	}
-	### check the presence of the pre-computed core set if options reuseCore or assembly is used
-	if ($coreex || $assembly) {
+	### check the presence of the pre-computed core set
+	if ($coreex) {
 		if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") {
-			print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
+			print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
 			exit;
 		}
 	}
@@ -1283,7 +1208,7 @@ sub checkOptions {
 
 	### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected
 	$optbreaker = 0;
-	while(!$minCoreOrthologs and (!$coreex and !$assembly)) {
+	while(!$minCoreOrthologs and !$coreex) {
 		if ($optbreaker >= 3){
 			print "No proper number given ... exiting.\n";
 			exit;
@@ -1298,12 +1223,10 @@ sub checkOptions {
 		$filter = 'no' if $filter eq 'F';
 	}
 
-	if (!$assembly){
-		$inputSeq = fetchSequence($seqFile, $dataDir);
-	}
+	$inputSeq = fetchSequence($seqFile, $dataDir);
 
 	## the user has not provided a sequence id, however, the refspec is determined.
-	if($seqId eq '' && !$assembly) {
+	if($seqId eq '') {
 		my $besthit;
 		if (!$blast){
 			## a refspec has been determined
@@ -1318,9 +1241,9 @@ sub checkOptions {
 		$refSpec = $besthit->{species};
 		my $details = "Evalue: " . $besthit->{evalue};
 		printOut("Seq id has been determined as $seqId in $refSpec with $details", 2);
-		if(length("$seqName|$refSpec|$seqId") > 60) {
-			die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n";
-		}
+		# if(length("$seqName|$refSpec|$seqId") > 60) {
+		# 	die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n";
+		# }
 		if($seqId eq '') {
 			print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n";
 			exit;
@@ -1398,22 +1321,24 @@ sub checkOptions {
 			mkdir $outputPath or die "could not re-create the output directory $outputPath\n";
 		}
 		elsif ($append) {
-			printOut("Appending output to $finalOutput\n", 1);
-			if (-e "$outputPath/$seqName.extended.profile") {
+			if (-e "$outputPath/$seqName.extended.fa") {
 				## read in the content for latter appending
-				printOut("Appending output to $outputPath/$seqName.extended.profile", 1);
-				open (IN, "<$outputPath/$seqName.extended.profile") or die "failed to open $outputPath/$seqName.extended.profile after selection of option -append\n";
+				printOut("Appending output to $outputPath/$seqName.extended.fa", 1);
+				open (IN, "<$outputPath/$seqName.extended.fa") or die "failed to open $outputPath/$seqName.extended.fa after selection of option -append\n";
 				while (<IN>) {
-					chomp $_;
-					my @keys = split '\|', $_;
-					$profile{$keys[1]} = 1;
+					my $line = $_;
+					if ($line =~ /\|/) {
+						chomp $line;
+						my @keys = split '\|', $line;
+						$profile{$keys[1]} = 1;
+					}
 				}
 			}
 			elsif ($fasoff) {
 				## no extended.profile file exists but not necessary, because user switched off FAS support -> do nothing
 			}
 			else {
-				printOut("Option -append was selected, but the existing output was incomplete. Please restart with the -force option to overwrite the output");
+				printOut("Option -append was selected, but the existing output was incomplete. Please restart with the -force option to overwrite the output", 1);
 				exit;
 			}
 		}
@@ -1428,9 +1353,8 @@ sub checkOptions {
 	#### checking for the min and max distance for the core set compilation
 	#### omit this check, if the option reuseCore has been selected (added 2019-02-04)
 	$optbreaker = 0;
-	if (!$coreex and !$assembly) {
+	if (!$coreex) {
 		my $node;
-		#print "Testing coreex assembly\n";
 		$node = $db->get_taxon(-taxonid => $refTaxa{$refSpec});
 		$node->name('supplied', $refSpec);
 		if (lc($maxDist) eq "root"){
@@ -1790,8 +1714,9 @@ sub cumulativeAlnScore{
 			my $line = $_;
 			$line =~ s/[\(\)]//g;
 			my @line = split('\s+',$line);
-
-			if($line[0] && ($line[0] eq $key)){
+			my $shortedId = substr($key, 0, 60);
+			# if($line[0] && ($line[0] eq $key)){
+			if($line[0] && ($line[0] eq $shortedId)){
 				if(exists $cumscores{$key}) {
 					$gotScore = 1;
 					$cumscores{$key} = $cumscores{$key} + $line[2];
@@ -2146,7 +2071,7 @@ sub addSeedSeq {
 	# get seed sequence and add it to the beginning of the fasta output
 	open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n";
 	my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta');
-	my %idTmp; # used to check which seq has already been written to output
+	my %idTmp = (); # used to check which seq has already been written to output
 	while(my $seq = $seqio->next_seq) {
 		my $id = $seq->id;
 		if ($id =~ /$refSpec/) {
@@ -2162,6 +2087,7 @@ sub addSeedSeq {
 		unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) {
 			unless ($idTmp{$id}) {
 				print TEMP ">$id\n", $seq->seq, "\n";
+				$idTmp{$id} = 1;
 			}
 		}
 	}
@@ -2643,9 +2569,9 @@ sub initialCheck {
 	}
 
 	# check executable FAS
-	my $fasCheckMsg = `setupFAS -t ./ -c 2>&1`;
+	my $fasCheckMsg = `fas.setup -t ./ -c 2>&1`;
 	if ($fasoff != 1 && $fasCheckMsg =~ /ERROR/) {
-		die "ERROR: greedyFAS not ready to use! Please check https://github.com/BIONF/FAS/wiki/prepareFAS\n";
+		die "ERROR: FAS not ready to use! Please check https://github.com/BIONF/FAS/wiki/setup\n";
 	}
 
 	# check seed fasta file
@@ -2690,9 +2616,19 @@ sub initialCheck {
 		}
 	}
 	# check weight_dir
-	if ($fasoff != 1 && !$assembly) {
+	if ($fasoff != 1) {
 		my %seen;
 		my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir);
+		my @notFolder;
+		for (my $i = 0;$i < scalar(@allTaxa); $i++){
+			if (-f "$blastDir/$allTaxa[$i]" || -f "$genomeDir/$allTaxa[$i]") {
+				push(@notFolder, $allTaxa[$i]);
+				splice(@allTaxa, $i, 1);
+			}
+		}
+		if (scalar(@notFolder) > 0) {
+			print "*** WARNING: Found files in $genomeDir or $blastDir:\t@notFolder\n";
+		}
 		chomp(my $allAnno = `ls $weightDir | $sedprog \'s/\\.json//\'`);
 		my @allAnno = split(/\n/, $allAnno);
 		my @missingAnno = array_minus(@allTaxa, @allAnno);
diff --git a/fdog/checkData.py b/fdog/checkData.py
index 84310ac..3aafe44 100644
--- a/fdog/checkData.py
+++ b/fdog/checkData.py
@@ -133,28 +133,29 @@ def checkDataFolder(checkDir, replace, delete, concat):
                     if os.path.islink(faFile):
                         faFile = os.path.realpath(faFile)
                     checkFileExist(faFile)
-                    if not '.checked' in faFile:
-                        if not os.path.exists(faFile+".checked"):
-                            checkFaFile = checkValidFasta(faFile)
-                            if checkFaFile == 'notFasta':
-                                sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile)
-                            elif checkFaFile == 'longHeader':
-                                sys.exit('*** ERROR: %s contains long headers!' % faFile)
-                            elif checkFaFile == 'space':
-                                sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile)
-                            elif checkFaFile == 'multiLine':
-                                if not concat:
-                                    print('*** ERROR: %s contains multiple-line sequences!' % faFile)
-                                    sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines')
-                                else:
-                                    rewriteSeqs(faFile, replace, delete)
-                            elif checkFaFile == 'ok':
-                                if not (delete or replace):
-                                    checkValidSeqs(faFile)
-                                else:
-                                    rewriteSeqs(faFile, replace, delete)
-                            writeCheckedFile(faFile)
-                            print(fd)
+                    if not '.mapping' in faFile:
+                        if not '.checked' in faFile:
+                            if not os.path.exists(faFile+".checked"):
+                                checkFaFile = checkValidFasta(faFile)
+                                if checkFaFile == 'notFasta':
+                                    sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile)
+                                elif checkFaFile == 'longHeader':
+                                    sys.exit('*** ERROR: %s contains long headers!' % faFile)
+                                elif checkFaFile == 'space':
+                                    sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile)
+                                elif checkFaFile == 'multiLine':
+                                    if not concat:
+                                        print('*** ERROR: %s contains multiple-line sequences!' % faFile)
+                                        sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines')
+                                    else:
+                                        rewriteSeqs(faFile, replace, delete)
+                                elif checkFaFile == 'ok':
+                                    if not (delete or replace):
+                                        checkValidSeqs(faFile)
+                                    else:
+                                        rewriteSeqs(faFile, replace, delete)
+                                writeCheckedFile(faFile)
+                                print(fd)
                 taxaList.append(fd)
             except subprocess.CalledProcessError as e:
                 print('*** ERROR: Problem while searching for fasta file')
@@ -162,13 +163,28 @@ def checkDataFolder(checkDir, replace, delete, concat):
                 sys.exit()
     return(taxaList)
 
-def checkCompleteAnno(weightDir, taxaList):
+def checkMissingJson(weightDir, taxaList):
     allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))]
     taxaAnno = [s + '.json' for s in taxaList]
     s = set(allAnno)
     missingAnno = [x for x in taxaAnno if x not in s]
     return(missingAnno)
 
+def checkCompleteAnno(weightDir, genomeDir):
+    allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))]
+    for f in allAnno:
+        tax = f.replace('.json', '')
+        print('...check annotations for %s' % tax)
+        jf = '%s/%s.json' % (weightDir, tax)
+        gf = '%s/%s/%s.fa' % (genomeDir, tax, tax)
+        cmd = 'fas.checkAnno -s %s -a %s -o %s' % (gf, jf, weightDir)
+        try:
+            subprocess.call([cmd], shell = True)
+        except subprocess.CalledProcessError as e:
+            print('*** ERROR: Problem while checking annotation file using fas.checkAnno!')
+            print(e.output.decode(sys.stdout.encoding))
+            sys.exit()
+
 def checkMissingNcbiID(namesDmp, taxaList):
     ncbiId = {}
     with open(namesDmp, 'r') as f:
@@ -193,7 +209,7 @@ def checkMissingNcbiID(namesDmp, taxaList):
     return(missingTaxa.keys(), dupTaxa)
 
 def main():
-    version = '0.0.3'
+    version = '0.0.6'
     parser = argparse.ArgumentParser(description='You are running fdog.checkData version ' + str(version) + '.')
     parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='')
     parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. fdog_dataPath/blast_dir)', action='store', default='')
@@ -237,12 +253,13 @@ def main():
 
     ### check weightDir
     print('=> Checking %s...' % weightDir)
-    missingAnno = checkCompleteAnno(weightDir, join2Lists(genomeTaxa, blastTaxa))
+    missingAnno = checkMissingJson(weightDir, join2Lists(genomeTaxa, blastTaxa))
     if len(missingAnno) > 0:
-        print('\033[92m*** WARNING: Annotations not found for:\033[0m')
+        print('\033[92m*** WARNING: Annotation files not found for:\033[0m')
         print(*missingAnno, sep = "\n")
         print('NOTE: You still can run fdog without FAS using the option "-fasoff"')
         caution = 1
+    checkCompleteAnno(weightDir, genomeDir)
 
     ### check ncbi IDs
     print('=> Checking NCBI taxonomy IDs...')
diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 46f83c0..12fcf6f 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-# Copyright (C) 2020 Hannah Muelbaier
+
+# Copyright (C) 2021 Hannah Muelbaier
 #
 #  This script is used to run fDOG-Assembly which performs targeted ortholog
 #  searches on genome assemblies
@@ -27,8 +28,30 @@
 import yaml
 import subprocess
 import time
-=======
+import shutil
+import multiprocessing as mp
+
 ########################### functions ##########################################
+def check_path(path):
+    if not os.path.exists(path):
+        print(path + " does not exist. Exciting ...")
+        sys.exit()
+
+def check_ref_sepc(species_list, fasta_file):
+    file = open(fasta_file, "r")
+    lines = file.readlines()
+    species_file = []
+
+    for line in lines:
+        if line[0] == ">":
+            species = line.split("|")[1]
+            species_file.append(species)
+    for species in species_list:
+        if species in species_file:
+            return species
+    print("Reference species is not part of the ortholog group. Exciting ...")
+    sys.exit()
+
 def load_config(config_file):
     with open(config_file, 'r') as stream:
         try:
@@ -36,23 +59,27 @@ def load_config(config_file):
         except yaml.YAMLError as exc:
             print(exc)
 
-def starting_subprocess(cmd, mode):
-    if mode == 'debug':
-        result = subprocess.run(cmd, shell=True)
-    elif mode == 'silent':
-        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True)
-    elif mode == 'normal':
-        result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True)
+def starting_subprocess(cmd, mode, time_out = None):
+
+    try:
+        if mode == 'debug':
+            result = subprocess.run(cmd, shell=True, timeout = time_out)
+        elif mode == 'silent':
+            result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out)
+        elif mode == 'normal':
+            result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out)
+    except subprocess.TimeoutExpired:
+        return 1
 
 def merge(blast_results, insert_length):
     #merging overlapping and contigous candidate regions
+    #format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>, <score>)]}
     number_regions = 0
     insert_length = int(insert_length)
+    score_list = []
     for key in blast_results:
         locations = blast_results[key]
         locations = sorted(locations, key = lambda x: int(x[3]))
-        #print("test")
-        #print(locations)
         size_list = len(locations)
         j = 0
         while j < size_list-1:
@@ -62,6 +89,8 @@ def merge(blast_results, insert_length):
                     #merge overlapping regions plus strand
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
@@ -69,6 +98,8 @@ def merge(blast_results, insert_length):
                     #merge overlapping regions minus strand
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -= 1
@@ -76,6 +107,8 @@ def merge(blast_results, insert_length):
                     #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand
                     locations[j][1] = max(locations[j][1], locations[i][1])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -=1
@@ -83,20 +116,24 @@ def merge(blast_results, insert_length):
                     #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand
                     locations[j][0] = min(locations[j][0], locations[i][0])
                     locations[j][2] = min(locations[j][2], locations[i][2])
+                    locations[j][4] = max(locations[j][4], locations[i][4])
+                    locations[j][6] = max(locations[j][6], locations[i][6])
                     locations.pop(i)
                     size_list -= 1
                     i -=1
                 i += 1
             j += 1
 
+        for entry in locations:
+            score_list.append(entry[6])
         number_regions += len(locations)
         blast_results[key] = locations
 
-    return blast_results, number_regions
+    return blast_results, number_regions, score_list
 
 def parse_blast(line, blast_results, cutoff):
-    # format blast line:  <contig> <sstart> <send> <evalue> <qstart> <qend>
-    # format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>)]}
+    # format blast line:  <contig> <sstart> <send> <evalue> <qstart> <qend> <score>
+    # format dictionary: {node_name: [(<start>,<send>,evalue, <qstart>,<qend>,<strand>, <score>)]}
     line = line.replace("\n", "")
     line_info = line.split("\t")
     evalue = float(line_info[3])
@@ -105,7 +142,7 @@ def parse_blast(line, blast_results, cutoff):
         return blast_results, evalue
     #add region to dictionary
     else:
-        node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5])
+        node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6])
         split = node_name.split("|")
         # finding out on which strand tBLASTn found a hit
         if sstart < send:
@@ -119,14 +156,32 @@ def parse_blast(line, blast_results, cutoff):
             node_name = split[1]
         if node_name in blast_results:
             list = blast_results[node_name]
-            list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand])
+            list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score])
             blast_results[node_name] = list
         else:
-            blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]]
+            blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]]
 
     return blast_results, evalue
 
-def candidate_regions(intron_length, cutoff_evalue, tmp_path):
+def get_x_results(blast_dic, x, score_list):
+
+    new_dic = {}
+    score_list.sort(reverse=True)
+    min = score_list[x - 1]
+    number_regions = 0
+
+    for key in blast_dic:
+        key_list = []
+        entries = blast_dic[key]
+        for i in entries:
+            if i[6] >= min:
+                key_list.append(i)
+        if key_list != []:
+            new_dic[key] = key_list
+            number_regions += len(key_list)
+    return new_dic, number_regions
+
+def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10):
     ###################### extracting candidate regions ########################
     # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
     blast_file = open(tmp_path + "/blast_results.out", "r")
@@ -142,10 +197,13 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path):
         blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
 
     if blast_results == {}:
+        blast_file.close()
         return 0,0
     else:
-        candidate_regions, number_regions = merge(blast_results, intron_length)
-
+        candidate_regions, number_regions, score_list = merge(blast_results, intron_length)
+        blast_file.close()
+        if number_regions > x:
+            candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list)
         return candidate_regions, number_regions
 
 def extract_seq(region_dic, path, tmp_path, mode):
@@ -187,7 +245,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
                         output.write(line)
                 sequence_file.close()
             except FileNotFoundError:
-                print("No gene found in region with ID:" + name + " , continuing with next region")
+                print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
     output.close()
 
 def searching_for_db(assembly_path):
@@ -250,11 +308,20 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
     if msaTool == "muscle":
         os.system("muscle -quiet -in " + output_file + " -out " + aln_file)
         #print("muscle -quiet -in " + output_file + " -out " + aln_file)
+        if not os.path.exists(aln_file):
+            print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.")
+            os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
+
     elif msaTool == "mafft-linsi":
         #print("mafft-linsi")
         os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
 
-    distances = get_distance_biopython(aln_file, matrix)
+    try:
+        distances = get_distance_biopython(aln_file, matrix)
+    except ValueError:
+        print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
+        return 0, "NaN", "NaN"
+
 
     distance_hit_query = distances[best_hit, candidate_name]
     distance_ref_hit = distances[best_hit, ref]
@@ -280,7 +347,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
         try:
             id_ref = seedDic[fdog_ref_species]
         except KeyError:
-            print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
+            #print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
             return 0, seed
         if searchTool == "blast":
             cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
@@ -298,45 +365,46 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
             id, gene, evalue = (line.replace("\n", "")).split("\t")
             gene_name = gene.split("|")[2]
             if gene_name != old_name:
-                print("candidate:%s"%(gene_name))
-                print("blast-hit:%s"%(id))
+                print("candidate:%s"%(gene_name)) if mode == "debug" else ""
+                print("blast-hit:%s"%(id)) if mode == "debug" else ""
                 min = float(evalue)
                 if id in id_ref:
                     orthologs.append(gene)
-                    print("\thitting\n")
+                    print("\thitting\n") if mode == "debug" else ""
                 else:
                     if checkCo == True:
                         for i in id_ref:
-                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else ""
                             co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
                             if co_orthologs_result == 1:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                                 orthologs.append(gene)
                             elif co_orthologs_result == 0:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+                                if distance_ref_hit != "NaN":
+                                    print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                     else:
-                        print("\tnothitting\n")
+                        print("\tnothitting\n") if mode == "debug" else ""
             elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs:
                 if id in id_ref:
                     orthologs.append(gene)
-                    print("\thitting\n")
+                    print("\thitting\n") if mode == "debug" else ""
                 else:
                     if checkCo == True:
                         for i in id_ref:
-                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+                            print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else ""
                             co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
                             if co_orthologs_result == 1:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                                 orthologs.append(gene)
                             elif co_orthologs_result == 0:
-                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+                                print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
                     else:
-                        print("\tnot hitting\n")
+                        print("\tnot hitting\n") if mode == "debug" else ""
             old_name = gene_name
 
 
         if orthologs == []:
-            print("No hit in the backward search, ...exciting")
+            #print("No hit in the backward search, ...exciting")
             return 0, seed
 
     else:
@@ -361,12 +429,12 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
         orthologs = set({})
 
         for species in seed:
-            print("backward search in species " + species + "\n")
+            print("backward search in species %s\n" %species)
             orthologs_new = set({})
             try:
                 id_ref = seedDic[species]
             except KeyError:
-                print("The species " + species + " isn't part of the core ortholog group, ... exciting")
+                #print("The species " + species + " isn't part of the core ortholog group, ... exciting")
                 return 0, seed
 
             cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
@@ -389,12 +457,13 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
 
             #print(species)
             #print(orthologs_new)
+            #print(orthologs)
             if species == fdog_ref_species:
                 orthologs = orthologs_new
             else:
                 orthologs = orthologs & orthologs_new
-                if orthologs == {}:
-                    print("No ortholog was found with option --strict")
+                if len(orthologs) == 0:
+                    #print("No ortholog was found with option --strict")
                     return 0, seed
 
 
@@ -403,6 +472,39 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
     orthologs = set(orthologs)
     return list(orthologs), seed
 
+def addRef(output, core_fasta, species_list):
+    #print(species_list)
+    output_file = open(output, "a+")
+    seq_records_core = readFasta(core_fasta)
+    seq_records_core = list(seq_records_core)
+    for species in species_list:
+        for entry_core in seq_records_core:
+            if species in entry_core.id:
+                output_file.write(">" + entry_core.id + "\n")
+                output_file.write(str(entry_core.seq) + "\n")
+    output_file.close()
+
+def addSeq(output, seq_list):
+    output_file = open(output, "a+")
+
+    for item in seq_list:
+        #print(item)
+        candidate_fasta = item[1]
+        sequenceIds = item[0]
+        if sequenceIds == 0 or sequenceIds == []:
+            continue
+        seq_records_candidate = readFasta(candidate_fasta)
+        seq_records_candidate = list(seq_records_candidate)
+        for entry_candidate in seq_records_candidate:
+            if entry_candidate.id in sequenceIds:
+                if entry_candidate.id == sequenceIds[0]:
+                    output_file.write(">" + entry_candidate.id + "|1" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
+                else:
+                    output_file.write(">" + entry_candidate.id + "|0" + "\n")
+                    output_file.write(str(entry_candidate.seq) + "\n")
+    output_file.close()
+
 def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
 
     output_file = open(output, "a+")
@@ -441,12 +543,18 @@ def createFasInput(orthologsOutFile, mappingFile):
         ncbi_id = (seq.id.split("@"))[1]
         mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n")
 
-
+    mappingFile.close()
     return fas_seed_id
 
 def cleanup(tmp, tmp_path):
     if tmp == False:
-        os.system('rm -r ' + tmp_path)
+        timeout = time.time() + 60*1
+        while os.path.exists(tmp_path):
+            shutil.rmtree(tmp_path, ignore_errors=True)
+            if time.time() > timeout:
+                print("tmp folder could not be removed!")
+                break
+
 
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     if len(candidate_names) == 1:
@@ -517,6 +625,80 @@ def clean_fas(path, file_type):
             new_line = id + "\t" + remain
 
         file.write(new_line)
+    file.close()
+
+def ortholog_search(args):
+    (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args
+    cmd = 'mkdir ' + out + '/tmp/' + asName
+    starting_subprocess(cmd, 'silent')
+    tmp_path = out + "tmp/" + asName + "/"
+    candidatesOutFile = tmp_path + group + ".candidates.fa"
+    #orthologsOutFile = out + "/" + group + ".extended.fa"
+    fasOutFile = out + "/" + group
+    #mappingFile = out + "/tmp/" + group + ".mapping.txt"
+
+    sys.stdout.write("Searching in species " + asName + "\n")
+    assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
+    db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
+    db_check = searching_for_db(db_path)
+
+    if db_check == 0:
+        #print("Creating a blast data base...")
+        cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
+        starting_subprocess(cmd, mode)
+        #print("\t ...finished \n")
+
+    #makes a tBLASTn search against database
+    #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+    #print("Starting tBLASTn search...")
+    cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+    time_tblastn_start = time.time()
+    exit_code = starting_subprocess(cmd, mode, 3600)
+    time_tblastn_end = time.time()
+    time_tblastn = time_tblastn_end - time_tblastn_start
+    if exit_code == 1:
+        sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName)
+        #cleanup(tmp, tmp_folder)
+        #sys.exit()
+        return [], candidatesOutFile
+    #else:
+        #print("\t ...finished")
+    print("Time tblastn %s in species %s" % (str(time_tblastn), asName))
+
+    regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
+    if regions == 0:
+        #no candidat region are available, no ortholog can be found
+        sys.stdout.write("No candidate region found for species %s!\n" % asName)
+        return [], candidatesOutFile
+
+    else:
+        print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName)
+        extract_seq(regions, db_path, tmp_path, mode)
+
+    ############### make Augustus PPX search ###################################
+    #print("Starting augustus ppx ...")
+    time_augustus_start = time.time()
+    augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
+    #print("\t ...finished \n")
+    time_augustus_end = time.time()
+    time_augustus = time_augustus_end - time_augustus_start
+    print("Time augustus: %s species %s \n" % (str(time_augustus), asName))
+
+    ################# backward search to filter for orthologs###################
+    if int(os.path.getsize(candidatesOutFile)) <= 0:
+        #print("No genes found at candidate regions\n")
+        return [], candidatesOutFile
+
+    reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
+
+    if reciprocal_sequences == 0:
+        if regions != 0:
+            sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
+        return [], candidatesOutFile
+    else:
+        reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+
+    return reciprocal_sequences, candidatesOutFile
 
 class Logger(object):
     def __init__(self, file):
@@ -534,24 +716,22 @@ def flush(self):
 
 def main():
 
-    #################### handle user input ########################################
+    #################### handle user input #####################################
 
     start = time.time()
-
-    version = '0.1.1'
-
-
+    version = '0.1.2'
+    ################### initialize parser ######################################
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
-
+    ################## required arguments ######################################
     required = parser.add_argument_group('Required arguments')
     required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/',
                             action='store', default='', required=True)
     required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True)
-    required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True)
-
+    required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True)
+    ################## optional arguments ######################################
     optional = parser.add_argument_group('Optional arguments')
-    optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int)
+    optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int)
     optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int)
     optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='')
     optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False)
@@ -564,15 +744,16 @@ def main():
     optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
     optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False)
     optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
-    optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='')
-    optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
+    optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[])
+    #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
     optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
     optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
-    optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='')
+    optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[])
     optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
     optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
-
-
+    optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
+    optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
+    optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False)
     args = parser.parse_args()
 
     # required
@@ -602,15 +783,15 @@ def main():
     msaTool = args.msaTool
     matrix = args.scoringmatrix
     taxa = args.coreTaxa
-    if taxa == '':
-        taxa =[]
-    else:
-        taxa = taxa.split(",")
     fasoff = args.fasoff
-    searchTaxon = args.searchTaxon
+    searchTaxa = args.searchTaxa
     silent = args.silent
     debug = args.debug
+    force = args.force
+    append = args.append
+    parallel = args.parallel
 
+    # output modes
     if debug == True and silent == True:
         print("It's not possible to use booth modes, please restart and use --debug or --silent")
         return 1
@@ -637,23 +818,43 @@ def main():
                 dataPath = cfg['dataPath']
             except:
                 dataPath = 'config'
+
+    if out == '':
+        out = os.getcwd()
+    else:
+        if out[-1] != "/":
+            out = out + "/"
+        check_path(out)
+
+    if os.path.exists(out + '/' + group):
+        if append != True and force != True:
+            print("Output folder for group " + group + " exists already. Please choose --force or --append.")
+            sys.exit()
+        elif force == True:
+            shutil.rmtree(out + '/' + group, ignore_errors=True)
+            refBool = False
+            os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
+            out = out + '/' + group + '/'
+        elif append == True:
+            out = out + '/' + group + '/'
+            refBool = True
+        else:
+            refBool = False # checks if sequences of reference species were already part of the extended.fa file
+    else:
+        os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
+        out = out + '/' + group + '/'
+        refBool = False
+
     if core_path == '':
         core_path = out + '/core_orthologs/'
     else:
         if not core_path.endswith('/'):
             core_path = core_path + '/'
+        check_path(core_path)
 
     if assemblyDir == '':
         assemblyDir = dataPath + '/assembly_dir/'
-    if out == '':
-        #print('test out \n')
-        out = os.getcwd()
-        os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
-        out = out + '/' + group + '/'
-    else:
-        if out[-1] != "/":
-            out = out + "/"
-
+    check_path(assemblyDir)
 
     try:
         f = open(out + "/fdog.log", "a+")
@@ -668,194 +869,130 @@ def main():
     else:
         sys.stdout = Logger(f)
 
-    # user input has to be checked here before fDOGassembly continues
-    assembly_names = os.listdir(assemblyDir)
-
-    ########################## some variables ##################################
-
-    refBool = False # checks if sequences of reference species were already part of the extended.fa file
+    ########################### other variables ################################
+    if searchTaxa == []:
+        assembly_names = os.listdir(assemblyDir)
+    else:
+        assembly_names = os.listdir(assemblyDir)
+        for Taxon in searchTaxa:
+            if Taxon not in assembly_names:
+                print("Taxon %s is not in the assembly_dir" % Taxon)
+                sys.exit()
+        assembly_names = searchTaxa
 
-    ########### paths ###########
+    ################################# paths ####################################
 
     msa_path = core_path + "/" + group +"/"+ group + ".aln"
+    check_path(msa_path)
     hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
+    check_path(hmm_path)
     fasta_path = core_path + "/" + group +"/"+ group + ".fa"
+    check_path(fasta_path)
     consensus_path = out + "/tmp/" + group + ".con"
     profile_path = out + "/tmp/" + group + ".prfl"
+    tmp_folder = out + "/tmp"
+
+    ########### is/are fDOG reference species part of ortholog group? ##########
+
+    fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path)
 
     ###################### create tmp folder ###################################
 
     cmd = 'mkdir ' + out + '/tmp'
     starting_subprocess(cmd, 'silent')
 
-    ######################## consensus sequence ################################
+    print("Gene: " + group)
+    print("fDOG reference species: " + fdog_ref_species + " \n")
 
+    ######################## consensus sequence ################################
+    group_computation_time_start = time.time()
     #make a majority-rule consensus sequence with the tool hmmemit from hmmer
-    print("Building a consensus sequence for gene " + group + " \n")
+    print("Building a consensus sequence")
     cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
     starting_subprocess(cmd, mode)
-    print("consensus sequence is finished\n")
+    print("\t ...finished\n")
 
     ######################## block profile #####################################
 
-    print("Building a block profile for gene " + group + " \n")
+    print("Building a block profile ...")
     cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
     starting_subprocess(cmd, 'silent')
 
     if int(os.path.getsize(profile_path)) > 0:
-        print("block profile is finished \n")
+        print("\t ...finished \n")
     else:
         print("Building block profiles failed. Using prepareAlign to convert alignment\n")
         new_path = core_path + group +"/"+ group + "_new.aln"
-        #print(cmd)
         cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
         starting_subprocess(cmd, mode)
         cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
-        #print(cmd)
         starting_subprocess(cmd, 'silent')
-        print("block profile is finished \n")
-
-    searchBool = False
-
-    #################### fDOG assembly computation for all species #############
-    for asName in assembly_names:
-        if searchBool == True:
-            break
-        if searchTaxon != '' and searchBool == False:
-            asName = searchTaxon
-            searchBool = True
-
-        ################### path definitions ###################################
-
-        cmd = 'mkdir ' + out + '/tmp/' + asName
-        starting_subprocess(cmd, 'silent')
-        tmp_path = out + "/tmp/" + asName + "/"
-        candidatesOutFile = tmp_path + group + ".candidates.fa"
-        if searchTaxon != '':
-            orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa"
-            fasOutFile = out + "/" + group + "_" + asName
-            mappingFile = tmp_path + group + "_" + asName + ".mapping.txt"
-        else:
-            orthologsOutFile = out + "/" + group + ".extended.fa"
-            fasOutFile = out + "/" + group
-            mappingFile = out + "/tmp/" + group + ".mapping.txt"
-
-        print("Searching in species " + asName + "\n")
-        assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
-        db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
-
-    ######################## tBLASTn ###########################################
-        #checks if data base exists already
-        db_check = searching_for_db(db_path)
-        if db_check == 0:
-            print("creating a blast data base \n")
-            cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
-            starting_subprocess(cmd, mode)
-            print("database is finished \n")
-        else:
-            print('blast data base exists already, continuing...')
-
-        #makes a tBLASTn search against the new database
-        #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
-        print("tBLASTn search against data base")
-        cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
-        starting_subprocess(cmd, mode)
-        print("tBLASTn search is finished")
-
-    ################### search for candidate regions and extract seq ###########
-    # parse blast and filter for candiate regions
-        regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
-
-        if regions == 0:
-            #no candidat region are available, no ortholog can be found
-            print("No candidate region found")
-            if refBool == True:
-                continue
-            else:
-                taxa = [fdog_ref_species]
-                reciprocal_sequences = 0
-        else:
-            print(str(number_regions) + " candiate regions were found. Extracting sequences...")
-            extract_seq(regions, db_path, tmp_path, mode)
-
-    ############### make Augustus PPX search ###################################
-
-            print("starting augustus ppx \n")
-            augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-            print("augustus is finished \n")
-
-    ################# backward search to filter for orthologs###################
-            if int(os.path.getsize(candidatesOutFile)) <= 0:
-                print("No genes found at candidate regions\n")
-                if searchTaxon == '' and refBool == True:
-                    continue
-                else:
-                    reciprocal_sequences = 0
-                    taxa = [fdog_ref_species]
-            else:
-                reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
-
-
-    ################## checking accepted genes for co-orthologs ################
-        if reciprocal_sequences == 0:
-            if regions != 0:
-                print("No ortholog fulfilled the reciprocity criteria")
-            if searchTaxon == '' and refBool == True:
-                continue
-            else:
-                reciprocal_sequences = 0
-        else:
-            reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
-
-    ################ add sequences to extended.fa in the output folder##########
-
-        addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
-        refBool = True
-
-    ############### make Annotation with FAS ###################################
-        # if we want to search in only one Taxon
-        if searchTaxon != '' and fasoff == False:
-            fas = time.time()
-            print("Calculating FAS scores")
-            fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
-            # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-            cmd = 'mkdir ' + tmp_path + 'anno_dir'
-            starting_subprocess(cmd, 'silent')
-            cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName
-            starting_subprocess(cmd, 'silent')
-            clean_fas(fasOutFile + "_forward.domains", 'domains')
-            clean_fas(fasOutFile + "_reverse.domains", 'domains')
-            clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile')
-
-
-    #if we searched in more than one Taxon and no ortholog was found
+        print(" \t ...finished \n")
+
+    group_computation_time_end = time.time()
+    time_group = group_computation_time_end - group_computation_time_start
+
+    ###################### ortholog search #####################################
+
+    ortholog_sequences = []
+    time_ortholog_start = time.time()
+    if parallel == True:
+        ##################### parallel compuataion #############################
+        calls = []
+        cpus = mp.cpu_count()
+        pool = mp.Pool(cpus)
+        for asName in assembly_names:
+            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs])
+
+        results = (pool.imap_unordered(ortholog_search, calls))
+        pool.close()
+        pool.join()
+        for i in results:
+            ortholog_sequences.append(i)
+    else:
+        ###################### computation species per species ################
+        for asName in assembly_names:
+            args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]
+            reciprocal_sequences, candidatesOutFile = ortholog_search(args)
+            ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
+
+    ################## preparing output ########################################
+    orthologsOutFile = out + "/" + group + ".extended.fa"
+    time_ortholog_end = time.time()
+    time_ortholog = time_ortholog_end - time_ortholog_start
+    if taxa == []:
+        taxa = [fdog_ref_species]
+    if append == True:
+        addSeq(orthologsOutFile, ortholog_sequences)
+    else:
+        addRef(orthologsOutFile, fasta_path, taxa)
+        addSeq(orthologsOutFile, ortholog_sequences)
+    mappingFile = out + "/tmp/" + group + ".mapping.txt"
 
-    if refBool == False and searchTaxon == '':
-        print("No orthologs found. Exciting ...")
-        cleanup(tmp, tmp_path)
-        return 1
-    #if we searched in more than one taxon
-    if fasoff == False and searchTaxon == '':
+    if fasoff == False:
         fas = time.time()
-        print("Calculating FAS scores")
+        print("Calculating FAS scores ...")
+
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
-        # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
-        cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
+        cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
         starting_subprocess(cmd, 'silent')
         clean_fas(out + group + "_forward.domains", 'domains')
         clean_fas(out + group + "_reverse.domains", 'domains')
         clean_fas(out + group + ".phyloprofile", 'phyloprofile')
+        print("\t ...finished \n")
     ################# remove tmp folder ########################################
-    if searchTaxon != '':
-        cleanup(tmp, tmp_path)
-    else:
-        cleanup(tmp, out + "/tmp/")
+    end = time.time()
+    time_fas = end - fas
+    print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.")
+    print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
+    sys.stdout = sys.__stdout__
 
     end = time.time()
     sys.stdout = sys.__stdout__
     #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
     f.close()
+    cleanup(tmp, tmp_folder)
 
 if __name__ == '__main__':
     main()
diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
deleted file mode 100644
index 1606b1d..0000000
--- a/fdog/mergeAssemblyOutput.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# -*- coding: utf-8 -*-
-
-#######################################################################
-# Copyright (C) 2020 Vinh Tran
-#
-#  This script is used to merge all output files (.extended.fa, .phyloprofile,
-#  _forward.domains, _reverse.domains) in a given directory into one file each.
-#
-#  This script is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License <http://www.gnu.org/licenses/> for
-#  more details
-#
-#  Contact: hannah.muelbaier@stud.uni-frankfurt.de
-#
-#######################################################################
-
-import sys
-import os
-from os import listdir as ldir
-import argparse
-from pathlib import Path
-
-def main():
-    version = '0.0.1'
-    parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.')
-    parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
-                        action='store', default='', required=True)
-    parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True)
-    parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False)
-    args = parser.parse_args()
-
-    directory = args.input
-    out = args.output
-    cleanup = args.cleanup
-    if not os.path.exists(os.path.abspath(directory)):
-        sys.exit('%s not found' % directory)
-    else:
-        directory = os.path.abspath(directory)
-
-    phyloprofile = None
-    set_phylo = set()
-    domains_0 = None
-    set_domains_f = set()
-    domains_1 = None
-    set_domains_r = set()
-    ex_fasta = None
-    set_fasta = set()
-    header_bool = False
-    for infile in ldir(directory):
-        if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile':
-            if not phyloprofile:
-                phyloprofile = open(out + '.phyloprofile', 'w')
-                phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                for line in lines:
-                    if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo:
-                        phyloprofile.write(line)
-                if len(lines) > 1:
-                    set_phylo = set(lines)
-            if cleanup == True:
-                os.remove(directory + '/' + infile)
-        elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains':
-            if not domains_0:
-                domains_0 = open(out + '_forward.domains', 'w')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                for line in lines:
-                    if line not in set_domains_f:
-                        domains_0.write(line)
-                if len(lines) > 1:
-                    set_domains_f = set(lines)
-            if cleanup == True:
-                os.remove(directory + '/' + infile)
-        elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains':
-            if not domains_1:
-                domains_1 = open(out + '_reverse.domains', 'w')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                for line in lines:
-                    if line not in set_domains_r:
-                        domains_1.write(line)
-                if len(lines) > 1:
-                    set_domains_r = set(lines)
-            if cleanup == True:
-                os.remove(directory + '/' + infile)
-        elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa':
-            if not ex_fasta:
-                ex_fasta = open(out + '.extended.fa', 'w')
-            with open(directory + '/' + infile, 'r') as reader:
-                lines = reader.readlines()
-                header = set()
-                #print(set_fasta)
-                for line in lines:
-                    if line[0] == ">":
-                        header.add(line)
-                        if line not in set_fasta:
-                            ex_fasta.write(line)
-                            header_bool = True
-                        else:
-                            header_bool = False
-                    else:
-                        if header_bool == True:
-                            ex_fasta.write(line)
-                set_fasta = header
-            if cleanup == True:
-                os.remove(directory + '/' +infile)
-        elif infile.endswith('.tsv'):
-            os.remove(directory + '/' + infile)
-
-    if phyloprofile:
-        phyloprofile.close()
-    if domains_0:
-        domains_0.close()
-    if domains_1:
-        domains_1.close()
-    if ex_fasta:
-        ex_fasta.close()
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py
index 2628280..a6c13c2 100644
--- a/fdog/mergeOutput.py
+++ b/fdog/mergeOutput.py
@@ -20,14 +20,15 @@
 import os
 from os import listdir as ldir
 import argparse
-from pathlib import Path
+
 
 def main():
     version = '0.0.1'
     parser = argparse.ArgumentParser(description='You are running fdog.mergeOutput version ' + str(version) + '.')
-    parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
+    parser.add_argument('-i', '--input',
+                        help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
                         action='store', default='', required=True)
-    parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True)
+    parser.add_argument('-o', '--output', help='Output name', action='store', default='', required=True)
     args = parser.parse_args()
 
     directory = args.input
diff --git a/fdog/removefDog.py b/fdog/removefDog.py
index 0ea27eb..7b705ea 100644
--- a/fdog/removefDog.py
+++ b/fdog/removefDog.py
@@ -19,9 +19,9 @@
 import os
 import argparse
 import subprocess
-from pathlib import Path
 import shutil
 
+
 def query_yes_no(question, default='yes'):
     valid = {'yes': True, 'y': True, 'ye': True,
              'no': False, 'n': False}
@@ -44,6 +44,7 @@ def query_yes_no(question, default='yes'):
             sys.stdout.write('Please respond with "yes" or "no" '
                              '(or "y" or "n").\n')
 
+
 def main():
     version = '0.0.1'
     parser = argparse.ArgumentParser(description='You are running fdog.remove version ' + str(version) + '.')
@@ -81,5 +82,6 @@ def main():
 
     print('NOTE: fdog genome data are still available at %s.' % dataPath)
 
+
 if __name__ == '__main__':
     main()
diff --git a/fdog/runMulti.py b/fdog/runMulti.py
index a696495..c19b0ff 100644
--- a/fdog/runMulti.py
+++ b/fdog/runMulti.py
@@ -48,7 +48,7 @@ def prepare(args, step):
     coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
     fasoff, countercheck, coreFilter, minScore,
     strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-    cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args
+    cpu, hyperthread, checkOff, debug, silent) = args
 
     mute = False
     if step == 'core':
@@ -69,9 +69,8 @@ def prepare(args, step):
     coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
     orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
-    otherArgs = [cpu, hyperthread, debug, True]
-    assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
-    return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute)
+    otherArgs = [cpu, hyperthread, checkOff, debug, True]
+    return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute)
 
 def getSeedName(seedFile):
     seqName = seedFile.split('.')[0]
@@ -106,10 +105,9 @@ def compileCore(options, seeds, inFol, cpu, outpath):
     for seed in seeds:
         seqFile = [inFol + '/' + seed]
         seqName = getSeedName(seed)
-
         if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)):
             (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core')
-            coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute])
+            coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute])
     if len(coreCompilationJobs) > 0:
         pool = mp.Pool(cpu)
         coreOut = []
@@ -131,7 +129,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath):
     for seed in seeds:
         seqFile = [inFol + '/' + seed]
         seqName = getSeedName(seed)
-        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
+        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
         if mute == True:
             print(seed)
         else:
@@ -178,7 +176,7 @@ def joinOutputs(outpath, jobName, seeds, keep, silent):
 def calcFAS (outpath, extendedFa, weightpath, cpu):
     print('Starting calculating FAS scores...')
     start = time.time()
-    fasCmd = 'fdogFAS -i %s -w %s --cores %s' % (extendedFa, weightpath, cpu)
+    fasCmd = 'fas.runFdogFas -i %s -w %s --cores %s --redo_anno' % (extendedFa, weightpath, cpu)
     try:
         subprocess.call([fasCmd], shell = True)
         end = time.time()
@@ -191,7 +189,7 @@ def calcFAS (outpath, extendedFa, weightpath, cpu):
         sys.exit('Problem running\n%s' % (fasCmd))
 
 def main():
-    version = '0.0.33'
+    version = '0.0.45'
     parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
     required = parser.add_argument_group('Required arguments')
@@ -289,17 +287,10 @@ def main():
         choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
     optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int)
     optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False)
+    optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False)
     optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
     optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
 
-    assembly_options = parser.add_argument_group('Assembly options')
-    assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
-    assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
-    assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
-    assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
-    assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
-    assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
-    assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
     ### get arguments
     args = parser.parse_args()
 
@@ -368,28 +359,20 @@ def main():
     # others
     cpu = args.cpu
     hyperthread = args.hyperthread
+    checkOff = args.checkOff
     debug = args.debug
     silentOff = args.silentOff
     if silentOff == True:
         silent = False
     else:
         silent = True
-       
-    #fdog_goes_assembly arguments
-    assembly = args.assembly
-    assemblyFile = args.assemblyFile
-    augustusRefSpec = args.augustusRefSpec
-    avIntron = args.avIntron
-    lengthExtension = args.lengthExtension
-    searchTool = args.searchTool
-    matrix = args.scoringmatrix
 
     ### check fas
     if not fasoff:
         try:
-            fasVersion = subprocess.run(['calcFAS --version'], shell = True, capture_output = True, check = True)
+            fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True)
         except:
-            sys.exit('Problem with calcFAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!')
+            sys.exit('Problem with FAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!')
 
     ### delete output folder and files if needed
     if forceComplete:
@@ -403,7 +386,10 @@ def main():
             outfiles = os.listdir(outpath)
             for item in outfiles:
                 if item.startswith(jobName):
-                    os.remove(os.path.join(outpath, item))
+                    try:
+                        os.remove(os.path.join(outpath, item))
+                    except:
+                        shutil.rmtree(outpath+'/'+item)
                 if item.startswith("runtime"):
                     os.remove(os.path.join(outpath, item))
             if os.path.exists(outpath + '/missing.txt'):
@@ -465,7 +451,7 @@ def main():
                 coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
                 fasoff, countercheck, coreFilter, minScore,
                 strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-                cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix]
+                cpu, hyperthread, checkOff, debug, silent]
 
     ### START
     Path(outpath).mkdir(parents=True, exist_ok=True)
diff --git a/fdog/runSingle.py b/fdog/runSingle.py
index a0ded09..c4abb82 100644
--- a/fdog/runSingle.py
+++ b/fdog/runSingle.py
@@ -65,13 +65,13 @@ def getfdogInfo(fdogPath, infoType):
         exit('%s not found' % (fdogPath + '/bin/oneSeq.pl'))
 
 def runSingle(args):
-    (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args
+    (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args
     # basic command
     (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs
     cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec)
     # add paths
-    (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs
-    cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath)
+    (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs
+    cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath)
     # add other I/O options
     (append, force, noCleanup, group, blast, db) = ioArgs
     if append == True:
@@ -153,36 +153,17 @@ def runSingle(args):
             if minScore > 0:
                 cmd = cmd + ' -coreFilter=%s -minScore=%s' % (coreFilter, minScore)
     # add other options
-    (cpu, hyperthread, debug, silent) = otherArgs
+    (cpu, hyperthread, checkOff, debug, silent) = otherArgs
     cmd = cmd + ' -cpu=%s' % cpu
     if hyperthread == True:
         cmd = cmd + ' -hyperthread'
+    if checkOff == True:
+        cmd = cmd + ' -checkOff'
     if debug == True:
         cmd = cmd + ' -debug'
     if silent == True:
         cmd = cmd + ' -silent'
-    # add assembly options
-    (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs
-    if assembly == True:
-        cmd = cmd + ' -assembly'
-        cmd = cmd + ' -reuseCore'
-        if not augustusRefSpec == '':
-            cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec
-        else:
-            sys.exit('An augutus reference species is requiered by using the option --assembly')
-        if not avIntron == '':
-            cmd = cmd + ' -avIntron=%s' % avIntron
-        if not lengthExtension == '':
-            cmd = cmd + ' -lengthExtension=%s' % lengthExtension
-        if not assemblyFile == '':
-            cmd = cmd + ' -assemblyFile=%s' % assemblyFile
-        if not searchTool == '':
-            cmd = cmd + ' -searchTool=%s' % searchTool
-        if not matrix == '':
-            cmd = cmd + ' -scoringmatrix=%s' % matrix
-        if not dataPath == '':
-            cmd = cmd + ' -dataPath=%s' % dataPath
-    #print(cmd)
+    # print(cmd)
     if mute == True:
         cmd = cmd + ' > /dev/null 2>&1'
     try:
@@ -191,7 +172,7 @@ def runSingle(args):
         sys.exit('Problem running\n%s' % (cmd))
 
 def main():
-    version = '0.0.33'
+    version = '0.0.45'
     parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
     required = parser.add_argument_group('Required arguments')
@@ -209,8 +190,6 @@ def main():
     optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='')
     optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='')
     optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
-    optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='')
-
 
     addtionalIO = parser.add_argument_group('Other I/O options')
     addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
@@ -289,17 +268,10 @@ def main():
         choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
     optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int)
     optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False)
+    optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False)
     optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
     optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
 
-    assembly_options = parser.add_argument_group('Assembly options')
-    assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
-    assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
-    assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
-    assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
-    assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
-    assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
-    assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
     ### get arguments
     args = parser.parse_args()
 
@@ -319,7 +291,6 @@ def main():
     searchpath = args.searchpath
     weightpath = args.weightpath
     pathFile = args.pathFile
-    assemblypath = args.assemblypath
 
     # other I/O arguments
     append = args.append
@@ -367,6 +338,7 @@ def main():
     # others
     cpu = args.cpu
     hyperthread = args.hyperthread
+    checkOff = args.checkOff
     debug = args.debug
     silentOff = args.silentOff
     if silentOff == True:
@@ -374,15 +346,6 @@ def main():
     else:
         silent = True
 
-    #fdog_goes_assembly arguments
-    assembly = args.assembly
-    assemblyFile = args.assemblyFile
-    augustusRefSpec = args.augustusRefSpec
-    avIntron = args.avIntron
-    lengthExtension = args.lengthExtension
-    searchTool = args.searchTool
-    matrix = args.scoringmatrix
-
     ### get fdog and data path
     dataPath = ''
     fdogPath = os.path.realpath(__file__).replace('/runSingle.py','')
@@ -430,30 +393,19 @@ def main():
             except:
                 sys.exit('weightpath not found in %s' % pathFile)
 
-    if assemblypath == '':
-        assemblypath = dataPath + '/assembly_dir'
-        if dataPath == 'config':
-            try:
-                assemblypath = cfg['assemblypath']
-            except:
-                sys.exit('assemblypath not found in %s' % pathFile)
-    if assembly == True:
-        searchpath = assemblypath
-
     ### check input arguments
     seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath])
     # group arguments
     basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth]
     ioArgs = [append, force, noCleanup, group, blast, db]
-    pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath]
+    pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath]
     coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
     fasArgs = [fasoff, countercheck, coreFilter, minScore]
     orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
-    otherArgs = [cpu, hyperthread, debug, silent]
-    assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath]
+    otherArgs = [cpu, hyperthread, checkOff, debug, silent]
 
     ### run fdog
-    runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False])
+    runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False])
 
 if __name__ == '__main__':
     main()
diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh
index 2e8ff02..1eaf176 100755
--- a/fdog/setup/install_lib.sh
+++ b/fdog/setup/install_lib.sh
@@ -154,9 +154,6 @@ perlModules=(
   List::Util
   Parallel::ForkManager
   POSIX
-  XML::SAX
-  XML::NamespaceSupport
-  XML::Parser
   Getopt::Long
   IO::Handle
   IPC::Run
diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index 1f74552..28eb851 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -114,14 +114,20 @@ echo "Downloading and installing annotation tools/databases:"
 fasta36="yes"
 if [ -z "$(which fasta36)" ]; then
   fasta36="no"
-  fasta36v="fasta-36.3.8h"
+  # fasta36v="fasta-36.3.8h"
+  fasta36v="36.3.8h_04-May-2020"
   if ! [ -f "bin/aligner/bin/fasta36" ]; then
-    echo "fasta-36"
-    wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz"
-    tar xf $fasta36v.tar.gz
-    rm "${fasta36v}.tar.gz"
-    mv $fasta36v/* $CURRENT/bin/aligner/
-    rm -rf $fasta36v
+    echo "fasta36"
+    # wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz"
+    # tar xf $fasta36v.tar.gz
+    # rm "${fasta36v}.tar.gz"
+    # mv $fasta36v/* $CURRENT/bin/aligner/
+    # rm -rf $fasta36v
+    wget "https://github.com/wrpearson/fasta36/archive/refs/tags/v${fasta36v}.tar.gz"
+    tar xf "v${fasta36v}.tar.gz"
+    rm "v${fasta36v}.tar.gz"
+    mv fasta36-${fasta36v}/* $CURRENT/bin/aligner/
+    rm -rf "fasta36-${fasta36v}"
     cd "$CURRENT/bin/aligner/src"
     if [ $sys=="Linux" ]; then
       make -f ../make/Makefile.linux64_sse2 all
@@ -162,10 +168,10 @@ if ! [ -f "$CURRENT/taxonomy/nodes" ]; then
   exit
 fi
 
-fasPrepare=0
+setupFAS=0
 if [ $fas == 1 ]; then
   cd "$CURRENT/bin"
-  if [ -z "$(which annoFAS)" ]; then
+  if [ -z "$(which fas.doAnno)" ]; then
     echo "FAS"
     pip install --user greedyFAS
     if [ -z "$($grepprog \$HOME/.local/bin:\$PATH ~/$bashFile)" ]; then
@@ -174,22 +180,22 @@ if [ $fas == 1 ]; then
     if [ -z "$($grepprog $homedir/.local/bin ~/$rprofile)" ]; then
       echo "Sys.setenv(PATH = paste(\"$homedir/.local/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile
     fi
-    fasPrepare=1
+    setupFAS=1
   else
-    if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then
-      fasPrepare=1
+    if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then
+      setupFAS=1
     fi
   fi
 
   cd $CURRENT
   source ~/$bashFile
-  if [ -z "$(which annoFAS)" ]; then
+  if [ -z "$(which fas.doAnno)" ]; then
     echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mpip install greedyFAS\e[0m!"
     echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m"
     exit
   else
-    if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then
-      fasPrepare=1
+    if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then
+      setupFAS=1
     fi
   fi
   echo "done!"
@@ -346,9 +352,6 @@ perlModules=(
   List::Util
   Parallel::ForkManager
   POSIX
-  XML::SAX
-  XML::NamespaceSupport
-  XML::Parser
   Getopt::Long
   IO::Handle
   IPC::Run
@@ -409,9 +412,9 @@ else
   echo "-------------------------------------"
   $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl
   $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl
-  if [ "$fasPrepare" == 1 ]; then
+  if [ "$setupFAS" == 1 ]; then
     echo "All tests succeeded."
-    echo -e "\e[91mPLEASE RUN\e[0m \e[96mprepareFAS\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m"
+    echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m"
     echo "Then you can test fdog with:"
   else
     echo "All tests succeeded, fdog should be ready to run. You can test it with:"
diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh
index 7b4bd08..73b8573 100755
--- a/fdog/setup/setup_conda.sh
+++ b/fdog/setup/setup_conda.sh
@@ -163,9 +163,6 @@ perlModules=(
   List::Util
   Parallel::ForkManager
   POSIX
-  XML::SAX
-  XML::NamespaceSupport
-  XML::Parser
   Getopt::Long
   IO::Handle
   IPC::Run
@@ -230,28 +227,28 @@ if ! [ -f "$CURRENT/taxonomy/nodes" ]; then
 fi
 
 cd "$CURRENT/bin"
-fasPrepare=0
-if [ -z "$(which annoFAS)" ]; then
+setupFAS=0
+if [ -z "$(which fas.doAnno)" ]; then
   echo "FAS"
   conda install -y -c BIONF fas
-  if [ -z "$(which annoFAS)" ]; then
+  if [ -z "$(which fas.doAnno)" ]; then
     echo -e "\e[31mInstallation of FAS failed! Please try again!\e[0m"
     exit
   fi
-  fasPrepare=1
+  setupFAS=1
 else
-  if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then
-    fasPrepare=1
+  if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then
+    setupFAS=1
   fi
 fi
 
-if [ -z "$(which annoFAS)" ]; then
+if [ -z "$(which fas.doAnno)" ]; then
   echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mconda install -c BIONF fas\e[0m or \e[91mpip install greedyFAS\e[0m"
   echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m"
   exit
 else
-  if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then
-    fasPrepare=1
+  if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then
+    setupFAS=1
   fi
 fi
 cd $CURRENT
@@ -435,9 +432,9 @@ else
   echo "-------------------------------------"
   $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl
   $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl
-  if [ "$fasPrepare" == 1 ]; then
+  if [ "$setupFAS" == 1 ]; then
     echo "All tests succeeded."
-    echo -e "\e[91mPLEASE RUN\e[0m \e[96msetupFAS\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m"
+    echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m"
     echo "Then you can test fdog with:"
   else
     echo "All tests succeeded, fdog should be ready to run. You can test it with:"
diff --git a/setup.py b/setup.py
index 75573c1..b61e66b 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@
 
 setup(
     name="fdog",
-    version="0.0.33",
+    version="0.0.45",
 
     python_requires='>=3.7.0',
     description="Feature-aware Directed OrtholoG search tool",
@@ -43,7 +43,7 @@
         'ete3',
         'six',
         'PyYAML',
-        'greedyFAS>=1.5.0'
+        'greedyFAS>=1.11.2'
     ],
     entry_points={
         'console_scripts': ["fdog.run = fdog.runSingle:main",

From 6d7df01742ec284f9df85a4f38b5ae06a4bb1a89 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 11:34:59 +0200
Subject: [PATCH 132/192] updated help function

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index e40701b..4733b4b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -769,7 +769,7 @@ def main():
     tmp = args.tmp
     strict = args.strict
     checkCoorthologs = args.checkCoorthologsRef
-    filter = args.filter
+    #
     if filter == True or filter == 'yes':
         filter = 'yes'
     else:

From ac2652b3162e8fc6d7af94ed6bb0ccea0b10053d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 11:40:22 +0200
Subject: [PATCH 133/192] updated help function

---
 fdog/fDOGassembly.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 207b50f..27a36c2 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-
 # Copyright (C) 2021 Hannah Muelbaier
 #
 #  This script is used to run fDOG-Assembly which performs targeted ortholog
@@ -555,7 +554,6 @@ def cleanup(tmp, tmp_path):
                 print("tmp folder could not be removed!")
                 break
 
-
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     if len(candidate_names) == 1:
         return candidate_names
@@ -719,6 +717,7 @@ def main():
     #################### handle user input #####################################
 
     start = time.time()
+
     version = '0.1.2'
     ################### initialize parser ######################################
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
@@ -770,11 +769,11 @@ def main():
     tmp = args.tmp
     strict = args.strict
     checkCoorthologs = args.checkCoorthologsRef
-    #
-    if filter == True or filter == 'yes':
-        filter = 'yes'
-    else:
-        filter = 'no'
+    #filter = args.filter
+    #if filter == True or filter == 'yes':
+        #filter = 'yes'
+    #else:
+        #filter = 'no'
     #others
     average_intron_length = args.avIntron
     length_extension = args.lengthExtension
@@ -972,7 +971,6 @@ def main():
     if fasoff == False:
         fas = time.time()
         print("Calculating FAS scores ...")
-
         tmp_path = out + '/tmp/'
         fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
         cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
@@ -988,9 +986,6 @@ def main():
     print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
     sys.stdout = sys.__stdout__
 
-    end = time.time()
-    sys.stdout = sys.__stdout__
-    #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
     f.close()
     cleanup(tmp, tmp_folder)
 

From 688b21e79318679690e1d88bc0e242c169be4da6 Mon Sep 17 00:00:00 2001
From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com>
Date: Tue, 19 Oct 2021 11:52:46 +0200
Subject: [PATCH 134/192] rm filter option

---
 fdog/fDOGassembly.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 12fcf6f..f7f5e05 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -770,11 +770,11 @@ def main():
     tmp = args.tmp
     strict = args.strict
     checkCoorthologs = args.checkCoorthologsRef
-    filter = args.filter
-    if filter == True or filter == 'yes':
-        filter = 'yes'
-    else:
-        filter = 'no'
+    #filter = args.filter
+    #if filter == True or filter == 'yes':
+        #filter = 'yes'
+    #else:
+        #filter = 'no'
     #others
     average_intron_length = args.avIntron
     length_extension = args.lengthExtension

From 075616852382405d6c922fde0677fdc210ca37fc Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 14:32:47 +0200
Subject: [PATCH 135/192] error handling of ValueError in function
 get_distance_biopython

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 27a36c2..d216048 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -317,7 +317,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
 
     try:
         distances = get_distance_biopython(aln_file, matrix)
-    except ValueError:
+    except get_distance_biopython.ValueError:
         print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
         return 0, "NaN", "NaN"
 

From f9d4623faa9817bb3f56672c29cf40df47110bce Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 15:36:51 +0200
Subject: [PATCH 136/192] test

---
 fdog/fDOGassembly.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d216048..adc48b2 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -317,6 +317,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
 
     try:
         distances = get_distance_biopython(aln_file, matrix)
+        print(distances)
     except get_distance_biopython.ValueError:
         print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
         return 0, "NaN", "NaN"
@@ -658,6 +659,7 @@ def ortholog_search(args):
         sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName)
         #cleanup(tmp, tmp_folder)
         #sys.exit()
+        sys.stdout.flush()
         return [], candidatesOutFile
     #else:
         #print("\t ...finished")
@@ -667,6 +669,7 @@ def ortholog_search(args):
     if regions == 0:
         #no candidat region are available, no ortholog can be found
         sys.stdout.write("No candidate region found for species %s!\n" % asName)
+        sys.stdout.flush()
         return [], candidatesOutFile
 
     else:
@@ -685,6 +688,7 @@ def ortholog_search(args):
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:
         #print("No genes found at candidate regions\n")
+        sys.stdout.flush()
         return [], candidatesOutFile
 
     reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
@@ -692,10 +696,12 @@ def ortholog_search(args):
     if reciprocal_sequences == 0:
         if regions != 0:
             sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
+        sys.stdout.flush()
         return [], candidatesOutFile
     else:
         reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
+    sys.stdout.flush()
     return reciprocal_sequences, candidatesOutFile
 
 class Logger(object):

From 134f94d830803c708b989d339201501ecad8ab39 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 15:46:02 +0200
Subject: [PATCH 137/192] test

---
 fdog/fDOGassembly.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index adc48b2..d6877e2 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -317,14 +317,16 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
 
     try:
         distances = get_distance_biopython(aln_file, matrix)
-        print(distances)
+        distance_hit_query = distances[best_hit, candidate_name]
+        distance_ref_hit = distances[best_hit, ref]
+        #print(distances)
     except get_distance_biopython.ValueError:
         print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
         return 0, "NaN", "NaN"
 
 
-    distance_hit_query = distances[best_hit, candidate_name]
-    distance_ref_hit = distances[best_hit, ref]
+    #distance_hit_query = distances[best_hit, candidate_name]
+    #distance_ref_hit = distances[best_hit, ref]
 
     if distance_ref_hit < distance_hit_query:
         #accepted

From 81af9add957ca8ec3eb0257a8d9d0b2e452ab2e9 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 15:50:21 +0200
Subject: [PATCH 138/192] test

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d6877e2..111baf7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -320,7 +320,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
         distance_hit_query = distances[best_hit, candidate_name]
         distance_ref_hit = distances[best_hit, ref]
         #print(distances)
-    except get_distance_biopython.ValueError:
+    except ValueError:
         print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
         return 0, "NaN", "NaN"
 

From 1c54841813a862987790ef7940d40dccbc8a9642 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 15:51:25 +0200
Subject: [PATCH 139/192] test

---
 fdog/fDOGassembly.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 111baf7..4bd9938 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -317,6 +317,9 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
 
     try:
         distances = get_distance_biopython(aln_file, matrix)
+        print(distances)
+        print(best_hit)
+        print(candidate_name)
         distance_hit_query = distances[best_hit, candidate_name]
         distance_ref_hit = distances[best_hit, ref]
         #print(distances)

From 8eb12a52ca85a97a1174028ba0c9018a70459dba Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 19 Oct 2021 15:58:14 +0200
Subject: [PATCH 140/192] fixed item not found error in distance function

---
 fdog/fDOGassembly.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 4bd9938..111baf7 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -317,9 +317,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
 
     try:
         distances = get_distance_biopython(aln_file, matrix)
-        print(distances)
-        print(best_hit)
-        print(candidate_name)
         distance_hit_query = distances[best_hit, candidate_name]
         distance_ref_hit = distances[best_hit, ref]
         #print(distances)

From 326ff4259b578d479f980914d1be0bc95d8290b7 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 20 Oct 2021 10:25:11 +0200
Subject: [PATCH 141/192] cleaning up output

---
 fdog/fDOGassembly.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 111baf7..36db8a3 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -244,7 +244,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
                         output.write(line)
                 sequence_file.close()
             except FileNotFoundError:
-                print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
+                pass
+                #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
     output.close()
 
 def searching_for_db(assembly_path):
@@ -321,7 +322,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
         distance_ref_hit = distances[best_hit, ref]
         #print(distances)
     except ValueError:
-        print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
+        #print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
         return 0, "NaN", "NaN"
 
 

From 97750b6f1fd010dc5998a7e1636a0663f7bfdcd8 Mon Sep 17 00:00:00 2001
From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com>
Date: Wed, 20 Oct 2021 11:39:36 +0200
Subject: [PATCH 142/192] Fdog goes assembly (#13)

---
 fdog/fDOGassembly.py | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f7f5e05..8aeec9b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -2,6 +2,7 @@
 
 #######################################################################
 
+
 # Copyright (C) 2021 Hannah Muelbaier
 #
 #  This script is used to run fDOG-Assembly which performs targeted ortholog
@@ -245,7 +246,9 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
                         output.write(line)
                 sequence_file.close()
             except FileNotFoundError:
-                print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
+                pass
+                #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
+
     output.close()
 
 def searching_for_db(assembly_path):
@@ -318,13 +321,16 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
 
     try:
         distances = get_distance_biopython(aln_file, matrix)
+        distance_hit_query = distances[best_hit, candidate_name]
+        distance_ref_hit = distances[best_hit, ref]
+        #print(distances)
     except ValueError:
-        print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
-        return 0, "NaN", "NaN"
+        #print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
 
 
-    distance_hit_query = distances[best_hit, candidate_name]
-    distance_ref_hit = distances[best_hit, ref]
+
+    #distance_hit_query = distances[best_hit, candidate_name]
+    #distance_ref_hit = distances[best_hit, ref]
 
     if distance_ref_hit < distance_hit_query:
         #accepted
@@ -660,6 +666,8 @@ def ortholog_search(args):
         sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName)
         #cleanup(tmp, tmp_folder)
         #sys.exit()
+        sys.stdout.flush()
+
         return [], candidatesOutFile
     #else:
         #print("\t ...finished")
@@ -669,6 +677,8 @@ def ortholog_search(args):
     if regions == 0:
         #no candidat region are available, no ortholog can be found
         sys.stdout.write("No candidate region found for species %s!\n" % asName)
+        sys.stdout.flush()
+
         return [], candidatesOutFile
 
     else:
@@ -687,6 +697,7 @@ def ortholog_search(args):
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:
         #print("No genes found at candidate regions\n")
+        sys.stdout.flush()
         return [], candidatesOutFile
 
     reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
@@ -694,10 +705,12 @@ def ortholog_search(args):
     if reciprocal_sequences == 0:
         if regions != 0:
             sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
+        sys.stdout.flush()
         return [], candidatesOutFile
     else:
         reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
+    sys.stdout.flush()
     return reciprocal_sequences, candidatesOutFile
 
 class Logger(object):
@@ -988,9 +1001,6 @@ def main():
     print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
     sys.stdout = sys.__stdout__
 
-    end = time.time()
-    sys.stdout = sys.__stdout__
-    #print(group + "\t" + str(end-fas) + "\t" + str(end-start))
     f.close()
     cleanup(tmp, tmp_folder)
 

From a7f9e19097922f3c69921c4ed17199ae1ba83bc8 Mon Sep 17 00:00:00 2001
From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com>
Date: Wed, 20 Oct 2021 12:04:27 +0200
Subject: [PATCH 143/192] bug fix in function checkCoOrthologs

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 8aeec9b..9b745db 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -325,6 +325,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
         distance_ref_hit = distances[best_hit, ref]
         #print(distances)
     except ValueError:
+        pass
         #print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
 
 

From 7b8745b8d1da86606a51d779580a68009927f91c Mon Sep 17 00:00:00 2001
From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com>
Date: Wed, 20 Oct 2021 12:20:45 +0200
Subject: [PATCH 144/192] bug fix: return rejection tuple on ValueError in
 checkCoOrthologs

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 9b745db..10f7aeb 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -327,6 +327,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
     except ValueError:
         pass
         #print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
+        return 0, "NaN", "NaN"
 
 
 

From c21a3f5b6ffe29c5beeb21b6a992dea15a4d02f7 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 22 Oct 2021 11:28:00 +0200
Subject: [PATCH 145/192] enabled output during parallel computation

---
 fdog/fDOGassembly.py | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 36db8a3..760e6d0 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -631,6 +631,7 @@ def clean_fas(path, file_type):
 
 def ortholog_search(args):
     (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args
+    output = []
     cmd = 'mkdir ' + out + '/tmp/' + asName
     starting_subprocess(cmd, 'silent')
     tmp_path = out + "tmp/" + asName + "/"
@@ -639,7 +640,7 @@ def ortholog_search(args):
     fasOutFile = out + "/" + group
     #mappingFile = out + "/tmp/" + group + ".mapping.txt"
 
-    sys.stdout.write("Searching in species " + asName + "\n")
+    output.append("Searching in species " + asName + "\n")
     assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
     db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
     db_check = searching_for_db(db_path)
@@ -659,24 +660,20 @@ def ortholog_search(args):
     time_tblastn_end = time.time()
     time_tblastn = time_tblastn_end - time_tblastn_start
     if exit_code == 1:
-        sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName)
-        #cleanup(tmp, tmp_folder)
-        #sys.exit()
-        sys.stdout.flush()
-        return [], candidatesOutFile
+        output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName)
+        return [], candidatesOutFile, output
     #else:
         #print("\t ...finished")
-    print("Time tblastn %s in species %s" % (str(time_tblastn), asName))
+    output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName))
 
     regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
     if regions == 0:
         #no candidat region are available, no ortholog can be found
-        sys.stdout.write("No candidate region found for species %s!\n" % asName)
-        sys.stdout.flush()
-        return [], candidatesOutFile
+        output.append("No candidate region found for species %s!\n" % asName)
+        return [], candidatesOutFile, output
 
     else:
-        print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName)
+        output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName)
         extract_seq(regions, db_path, tmp_path, mode)
 
     ############### make Augustus PPX search ###################################
@@ -686,26 +683,23 @@ def ortholog_search(args):
     #print("\t ...finished \n")
     time_augustus_end = time.time()
     time_augustus = time_augustus_end - time_augustus_start
-    print("Time augustus: %s species %s \n" % (str(time_augustus), asName))
+    output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName))
 
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:
         #print("No genes found at candidate regions\n")
-        sys.stdout.flush()
-        return [], candidatesOutFile
+        return [], candidatesOutFile, output
 
     reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
 
     if reciprocal_sequences == 0:
         if regions != 0:
-            sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
-        sys.stdout.flush()
-        return [], candidatesOutFile
+            output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
+        return [], candidatesOutFile, output
     else:
         reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
 
-    sys.stdout.flush()
-    return reciprocal_sequences, candidatesOutFile
+    return reciprocal_sequences, candidatesOutFile, output
 
 class Logger(object):
     def __init__(self, file):
@@ -956,12 +950,14 @@ def main():
         pool.close()
         pool.join()
         for i in results:
-            ortholog_sequences.append(i)
+            print(i[2])
+            ortholog_sequences.append(i[0], i[1])
     else:
         ###################### computation species per species ################
         for asName in assembly_names:
             args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]
-            reciprocal_sequences, candidatesOutFile = ortholog_search(args)
+            reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args)
+            print(output_ortholog_search)
             ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
 
     ################## preparing output ########################################

From d4374231dd228c97dd42f771f6d9b462faf2eb47 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 22 Oct 2021 11:30:49 +0200
Subject: [PATCH 146/192] enabled output during parallel computation:
 remove leftover merge conflict markers

---
 fdog/fDOGassembly.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 08de346..42ddf69 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -664,17 +664,8 @@ def ortholog_search(args):
     time_tblastn_end = time.time()
     time_tblastn = time_tblastn_end - time_tblastn_start
     if exit_code == 1:
-<<<<<<< HEAD
         output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName)
         return [], candidatesOutFile, output
-=======
-        sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName)
-        #cleanup(tmp, tmp_folder)
-        #sys.exit()
-        sys.stdout.flush()
-
-        return [], candidatesOutFile
->>>>>>> 0016fa5fd0081814b3d2457b7f6b3d5ac4b987a1
     #else:
         #print("\t ...finished")
     output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName))
@@ -682,15 +673,8 @@ def ortholog_search(args):
     regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
     if regions == 0:
         #no candidat region are available, no ortholog can be found
-<<<<<<< HEAD
         output.append("No candidate region found for species %s!\n" % asName)
         return [], candidatesOutFile, output
-=======
-        sys.stdout.write("No candidate region found for species %s!\n" % asName)
-        sys.stdout.flush()
-
-        return [], candidatesOutFile
->>>>>>> 0016fa5fd0081814b3d2457b7f6b3d5ac4b987a1
 
     else:
         output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName)

From 7a37abc0a5109147779704517eddef55135a10ba Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 22 Oct 2021 11:44:35 +0200
Subject: [PATCH 147/192] bug fix: reject candidate on ValueError in
 checkCoOrthologs

---
 fdog/fDOGassembly.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 42ddf69..6464384 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -325,7 +325,9 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
         distance_ref_hit = distances[best_hit, ref]
         #print(distances)
     except ValueError:
+        pass
         #print("Failure in distance computation, Candidate  %s will be rejected" % candidate_name)
+        return 0, "NaN", "NaN"
 
 
 

From 02f004671375ebf02c9bc0a607723f6409a9150f Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 22 Oct 2021 11:56:50 +0200
Subject: [PATCH 148/192] improved output: print each collected message on
 its own line

---
 fdog/fDOGassembly.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 6464384..56de5f1 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -955,15 +955,17 @@ def main():
         pool.close()
         pool.join()
         for i in results:
-            print(i[2])
             ortholog_sequences.append(i[0], i[1])
+            for k in i[2]:
+                print(k)
     else:
         ###################### computation species per species ################
         for asName in assembly_names:
             args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]
             reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args)
-            print(output_ortholog_search)
             ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
+            for k in output_ortholog_search:
+                print(k)
 
     ################## preparing output ########################################
     orthologsOutFile = out + "/" + group + ".extended.fa"

From 52feba3fdc5d50a9d2f14953297fad5381091531 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 22 Oct 2021 12:09:58 +0200
Subject: [PATCH 149/192] improved output: fix ortholog_sequences.append call

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 56de5f1..ad10cc8 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -955,7 +955,7 @@ def main():
         pool.close()
         pool.join()
         for i in results:
-            ortholog_sequences.append(i[0], i[1])
+            ortholog_sequences.append([i[0], i[1]])
             for k in i[2]:
                 print(k)
     else:

From 9c228b2865d1682f2040250f5f4107f11c8d11c4 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 25 Oct 2021 13:23:43 +0200
Subject: [PATCH 150/192] a file can be used as input for --searchTaxa

---
 fdog/fDOGassembly.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index ad10cc8..dbd49e0 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -880,12 +880,24 @@ def main():
     if searchTaxa == []:
         assembly_names = os.listdir(assemblyDir)
     else:
-        assembly_names = os.listdir(assemblyDir)
-        for Taxon in searchTaxa:
-            if Taxon not in assembly_names:
-                print("Taxon %s is not in the assembly_dir" % Taxon)
-                sys.exit()
-        assembly_names = searchTaxa
+        if len(searchTaxa) > 1:
+            assembly_names = os.listdir(assemblyDir)
+            for Taxon in searchTaxa:
+                if Taxon not in assembly_names:
+                    print("Taxon %s is not in the assembly_dir" % Taxon)
+                    sys.exit()
+            assembly_names = searchTaxa
+        else:
+            if searchTaxa[0] in assembly_names:
+                assembly_names = searchTaxa
+            elif os.path.isfile(searchTaxa[0]):
+                with open(searchTaxa[0]) as file:
+                    lines = file.readlines()
+                    assembly_names = [line.rstrip() for line in lines]
+            else:
+                print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0])
+
+
 
     ################################# paths ####################################
 

From fdb30730476e611d74e0ed8d527ef8711821a7d9 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 25 Oct 2021 13:39:19 +0200
Subject: [PATCH 151/192] bug fix: check single --searchTaxa value against
 os.listdir(assemblyDir)

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index dee4ab4..fc510c4 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -889,7 +889,7 @@ def main():
                     sys.exit()
             assembly_names = searchTaxa
         else:
-            if searchTaxa[0] in assembly_names:
+            if searchTaxa[0] in os.listdir(assemblyDir):
                 assembly_names = searchTaxa
             elif os.path.isfile(searchTaxa[0]):
                 with open(searchTaxa[0]) as file:

From f43820e9fc66ec930e89e50ffeba679d5b9f43cd Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 26 Oct 2021 16:55:41 +0200
Subject: [PATCH 152/192] fixed bug in searching_for_db

---
 fdog/fDOGassembly.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index fc510c4..1b44ea9 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -257,6 +257,11 @@ def searching_for_db(assembly_path):
     check = True
     for end in db_endings:
         check = check and os.path.exists(assembly_path + end)
+
+    if check == False:
+        check = True
+        for end in db_endings:
+            check = check and os.path.exists(assembly_path + '.00.' + end)
     return check
 
 def get_distance_biopython(file, matrix):
@@ -563,7 +568,6 @@ def cleanup(tmp, tmp_path):
                 print("tmp folder could not be removed!")
                 break
 
-
 def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
     if len(candidate_names) == 1:
         return candidate_names

From 7d12ffa28c25f2115ad6005e9d4bf7071508023c Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 26 Oct 2021 17:10:06 +0200
Subject: [PATCH 153/192] fixed bug in function searching_for_db

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 1b44ea9..fdb90fa 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -261,7 +261,7 @@ def searching_for_db(assembly_path):
     if check == False:
         check = True
         for end in db_endings:
-            check = check and os.path.exists(assembly_path + '.00.' + end)
+            check = check and os.path.exists(assembly_path + '.00' + end)
     return check
 
 def get_distance_biopython(file, matrix):

From 110073f4e00da8795dd43cbec28ac12b9d90b4f4 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 26 Oct 2021 17:22:32 +0200
Subject: [PATCH 154/192] bug fix searching_for_db function

---
 fdog/fDOGassembly.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index fdb90fa..2b9e6fb 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -256,12 +256,8 @@ def searching_for_db(assembly_path):
     db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto']
     check = True
     for end in db_endings:
-        check = check and os.path.exists(assembly_path + end)
-
-    if check == False:
-        check = True
-        for end in db_endings:
-            check = check and os.path.exists(assembly_path + '.00' + end)
+        if not any(File.endswith(end) for File in os.listdir(assembly_path)):
+            check = False
     return check
 
 def get_distance_biopython(file, matrix):

From afd28c60bf071f9d2943b6ebcb18ee2c4dcd0c09 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 26 Oct 2021 17:28:05 +0200
Subject: [PATCH 155/192] testing: pass blast_dir path to searching_for_db

---
 fdog/fDOGassembly.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 2b9e6fb..b92cefc 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -649,7 +649,8 @@ def ortholog_search(args):
     output.append("Searching in species " + asName + "\n")
     assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
     db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
-    db_check = searching_for_db(db_path)
+    blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + asName
+    db_check = searching_for_db(blast_dir_path)
 
     if db_check == 0:
         #print("Creating a blast data base...")

From 6076c5da9bf5f4abfca1a724dc142b763c46e674 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 26 Oct 2021 17:29:55 +0200
Subject: [PATCH 156/192] testing: drop asName from blast_dir_path

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b92cefc..d220039 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -649,7 +649,7 @@ def ortholog_search(args):
     output.append("Searching in species " + asName + "\n")
     assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
     db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
-    blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + asName
+    blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/"
     db_check = searching_for_db(blast_dir_path)
 
     if db_check == 0:

From 2f38455330bef1e8b63a2d8ab0c1aed375c7c479 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 9 Feb 2022 14:38:49 +0100
Subject: [PATCH 157/192] reorganization of code to enable the use of metaeuk
 as an alternative to Augustus

---
 fdog/.DS_Store       | Bin 8196 -> 8196 bytes
 fdog/fDOGassembly.py | 145 ++++++++++++++++++++++++++-----------------
 2 files changed, 89 insertions(+), 56 deletions(-)

diff --git a/fdog/.DS_Store b/fdog/.DS_Store
index 34e42555d35fd3e0f289e49c57c3fa62ffc1f870..a99a01c231b8aab3b888fe9e4dacf4b66808b3f0 100644
GIT binary patch
delta 40
wcmZp1XmOa}&nU7nU^hRb$YvgaaOTbHg(FxdHu!92m-xoA*;8~M)5Hc(01vhes{jB1

delta 69
zcmZp1XmOa}&nUVvU^hRb=w=>)aAs*ShFpeJh9ZV^AnC|Z41}pbktBv3hRVr#!U{~x
V&YMGo(^)pNOMGM5yitUm8312<5m*2K

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d220039..8a9af97 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-
-
 # Copyright (C) 2021 Hannah Muelbaier
 #
 #  This script is used to run fDOG-Assembly which performs targeted ortholog
@@ -635,8 +633,8 @@ def clean_fas(path, file_type):
         file.write(new_line)
     file.close()
 
-def ortholog_search(args):
-    (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args
+def ortholog_search_tblastn(args):
+    (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction) = args
     output = []
     cmd = 'mkdir ' + out + '/tmp/' + asName
     starting_subprocess(cmd, 'silent')
@@ -670,8 +668,6 @@ def ortholog_search(args):
         output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName)
         return [], candidatesOutFile, output
 
-    #else:
-        #print("\t ...finished")
     output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName))
 
     regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
@@ -684,14 +680,18 @@ def ortholog_search(args):
         output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName)
         extract_seq(regions, db_path, tmp_path, mode)
 
-    ############### make Augustus PPX search ###################################
-    #print("Starting augustus ppx ...")
-    time_augustus_start = time.time()
-    augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-    #print("\t ...finished \n")
-    time_augustus_end = time.time()
-    time_augustus = time_augustus_end - time_augustus_start
-    output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName))
+
+    if gene_prediction == "augustus":
+        ############### make Augustus PPX search ###################################
+        #print("Starting augustus ppx ...")
+        time_augustus_start = time.time()
+        augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
+        #print("\t ...finished \n")
+        time_augustus_end = time.time()
+        time_augustus = time_augustus_end - time_augustus_start
+        output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName))
+    else:
+        print("test")
 
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:
@@ -709,6 +709,48 @@ def ortholog_search(args):
 
     return reciprocal_sequences, candidatesOutFile, output
 
+def blockProfiles(core_path, group, mode):
+
+    ######################## paths ################################
+    msa_path = core_path + "/" + group +"/"+ group + ".aln"
+    check_path(msa_path)
+    profile_path = out + "/tmp/" + group + ".prfl"
+
+    ######################## block profile #####################################
+
+    print("Building a block profile ...")
+    cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
+    starting_subprocess(cmd, 'silent')
+
+    if int(os.path.getsize(profile_path)) > 0:
+        print("\t ...finished \n")
+    else:
+        print("Building block profiles failed. Using prepareAlign to convert alignment\n")
+        new_path = core_path + group +"/"+ group + "_new.aln"
+        cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
+        starting_subprocess(cmd, mode)
+        cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
+        starting_subprocess(cmd, 'silent')
+        print(" \t ...finished \n")
+
+    return profile_path
+
+def consensusSequence(core_path, group, mode):
+
+    ######################## paths ################################
+    hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
+    check_path(hmm_path)
+    consensus_path = out + "/tmp/" + group + ".con"
+
+    ######################## consensus sequence ################################
+    #make a majority-rule consensus sequence with the tool hmmemit from hmmer
+    print("Building a consensus sequence")
+    cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
+    starting_subprocess(cmd, mode)
+    print("\t ...finished\n")
+
+    return consensus_path
+
 class Logger(object):
     def __init__(self, file):
         self.file = file
@@ -722,7 +764,6 @@ def write(self, message):
     def flush(self):
         pass
 
-
 def main():
 
     #################### handle user input #####################################
@@ -736,7 +777,6 @@ def main():
     required = parser.add_argument_group('Required arguments')
     required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/',
                             action='store', default='', required=True)
-    required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True)
     required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True)
     ################## optional arguments ######################################
     optional = parser.add_argument_group('Optional arguments')
@@ -763,11 +803,12 @@ def main():
     optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
     optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
     optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False)
+    optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False)
+    optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
     args = parser.parse_args()
 
     # required
     group = args.gene
-    augustus_ref_species = args.augustusRefSpec
     fdog_ref_species = args.refSpec
     #paths user input
     assemblyDir = args.assemblyPath
@@ -800,6 +841,18 @@ def main():
     append = args.append
     parallel = args.parallel
 
+    #gene prediction tool
+    augustus = args.augustus
+    if augutus == True:
+        augustus_ref_species = args.augustusRefSpec
+        if augustus_ref_species == '':
+            print("Augustus reference species is required when using Augustus as gene prediction tool")
+            return 1
+        gene_prediction = "augustus"
+    else:
+        gene_prediction = "metaeuk"
+
+
     # output modes
     if debug == True and silent == True:
         print("It's not possible to use booth modes, please restart and use --debug or --silent")
@@ -903,14 +956,8 @@ def main():
 
     ################################# paths ####################################
 
-    msa_path = core_path + "/" + group +"/"+ group + ".aln"
-    check_path(msa_path)
-    hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
-    check_path(hmm_path)
     fasta_path = core_path + "/" + group +"/"+ group + ".fa"
     check_path(fasta_path)
-    consensus_path = out + "/tmp/" + group + ".con"
-    profile_path = out + "/tmp/" + group + ".prfl"
     tmp_folder = out + "/tmp"
 
     ########### is/are fDOG reference species part of ortholog group? ##########
@@ -925,47 +972,30 @@ def main():
     print("Gene: " + group)
     print("fDOG reference species: " + fdog_ref_species + " \n")
 
-    ######################## consensus sequence ################################
-    group_computation_time_start = time.time()
-    #make a majority-rule consensus sequence with the tool hmmemit from hmmer
-    print("Building a consensus sequence")
-    cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
-    starting_subprocess(cmd, mode)
-    print("\t ...finished\n")
+    ###################### preparations ########################################
 
-    ######################## block profile #####################################
+    if augustus == True:
+        group_computation_time_start = time.time()
+        consensus_path = consensusSequence(core_path, group, mode)
+        profile_path = blockProfiles(core_path, group, mode)
+        group_computation_time_end = time.time()
+        time_group = group_computation_time_end - group_computation_time_start
 
-    print("Building a block profile ...")
-    cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
-    starting_subprocess(cmd, 'silent')
-
-    if int(os.path.getsize(profile_path)) > 0:
-        print("\t ...finished \n")
-    else:
-        print("Building block profiles failed. Using prepareAlign to convert alignment\n")
-        new_path = core_path + group +"/"+ group + "_new.aln"
-        cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
-        starting_subprocess(cmd, mode)
-        cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
-        starting_subprocess(cmd, 'silent')
-        print(" \t ...finished \n")
-
-    group_computation_time_end = time.time()
-    time_group = group_computation_time_end - group_computation_time_start
 
     ###################### ortholog search #####################################
 
     ortholog_sequences = []
     time_ortholog_start = time.time()
+
     if parallel == True:
-        ##################### parallel compuataion #############################
+        ##################### parallel computation #############################
         calls = []
         cpus = mp.cpu_count()
         pool = mp.Pool(cpus)
         for asName in assembly_names:
-            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs])
+            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction])
 
-        results = (pool.imap_unordered(ortholog_search, calls))
+        results = (pool.imap_unordered(ortholog_search_tblastn, calls))
         pool.close()
         pool.join()
         for i in results:
@@ -973,18 +1003,20 @@ def main():
             for k in i[2]:
                 print(k)
     else:
-        ###################### computation species per species ################
+        ###################### computation species wise ################
         for asName in assembly_names:
-            args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]
-            reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args)
+            args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction]
+            reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args)
             ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
             for k in output_ortholog_search:
                 print(k)
 
-    ################## preparing output ########################################
-    orthologsOutFile = out + "/" + group + ".extended.fa"
     time_ortholog_end = time.time()
     time_ortholog = time_ortholog_end - time_ortholog_start
+
+    ################## preparing output ########################################
+    orthologsOutFile = out + "/" + group + ".extended.fa"
+
     if taxa == []:
         taxa = [fdog_ref_species]
     if append == True:
@@ -1006,6 +1038,7 @@ def main():
         clean_fas(out + group + "_reverse.domains", 'domains')
         clean_fas(out + group + ".phyloprofile", 'phyloprofile')
         print("\t ...finished \n")
+
     ################# remove tmp folder ########################################
     end = time.time()
     time_fas = end - fas

From e088dff0ac04bd6fd5aa27aedf38af1502eda834 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 9 Feb 2022 16:05:30 +0100
Subject: [PATCH 158/192] included metaeuk

---
 fdog/fDOGassembly.py | 74 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 8a9af97..f12e9cc 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -212,6 +212,20 @@ def extract_seq(region_dic, path, tmp_path, mode):
         cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f"
         starting_subprocess(cmd, mode)
 
+def extract_sequence_from_to(name, file, start, end):
+    out = name + ".fasta"
+    if start < 0:
+        start = 0
+    with open(out,"w") as f:
+        for seq_record in SeqIO.parse(file, "fasta"):
+                f.write(str(seq_record.id) + "\n")
+                sequence_length = len(seq_record.seq)
+                if end > sequence_length:
+                    end = sequence_length
+                f.write(str(seq_record.seq[start:end]) + "\n")
+
+    return out, start, end
+
 def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode):
     output = open(candidatesOutFile, "w")
 
@@ -246,9 +260,43 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
             except FileNotFoundError:
                 pass
                 #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
-
     output.close()
 
+def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, core_group):
+    output = open(candidatesOutFile, "w")
+
+    for key in regions:
+        locations = regions[key]
+        counter = 0
+        for i in locations:
+            #some variables
+            counter += 1
+            start = str(i[0] - length_extension)
+            end = str(i[1] + length_extension)
+            name = key + "_" + str(counter)
+            file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
+            #metaeuk call
+            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk"
+            print(cmd)
+            starting_subprocess(cmd, mode)
+            # parsing header and sequences
+            try:
+                sequence_file = open(tmp_path + name + ".fas", "r")
+                lines = sequence_file.readlines()
+                id = 0
+                for line in lines:
+                    if line[0] == ">":
+                        id += 1
+                        header = ">" + group + "|" + ass_name + "|" + name + "_" + id
+                        output.write(header)
+                    else:
+                        output.write(line)
+                sequence_file.close()
+            except FileNotFoundError:
+                pass
+
+        output.close()
+
 def searching_for_db(assembly_path):
 
     db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto']
@@ -473,8 +521,6 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
                     #print("No ortholog was found with option --strict")
                     return 0, seed
 
-
-
     #print(orthologs)
     orthologs = set(orthologs)
     return list(orthologs), seed
@@ -651,14 +697,11 @@ def ortholog_search_tblastn(args):
     db_check = searching_for_db(blast_dir_path)
 
     if db_check == 0:
-        #print("Creating a blast data base...")
         cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
         starting_subprocess(cmd, mode)
-        #print("\t ...finished \n")
 
     #makes a tBLASTn search against database
     #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
-    #print("Starting tBLASTn search...")
     cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
     time_tblastn_start = time.time()
     exit_code = starting_subprocess(cmd, mode, 3600)
@@ -683,15 +726,17 @@ def ortholog_search_tblastn(args):
 
     if gene_prediction == "augustus":
         ############### make Augustus PPX search ###################################
-        #print("Starting augustus ppx ...")
         time_augustus_start = time.time()
         augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
-        #print("\t ...finished \n")
         time_augustus_end = time.time()
         time_augustus = time_augustus_end - time_augustus_start
         output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName))
     else:
-        print("test")
+        time_metaeuk_start = time.time()
+        metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path)
+        time_metaeuk_end = time.time()
+        time_metaeuk = time_metaeuk_end - time_metaeuk_start
+        output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))")
 
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:
@@ -820,11 +865,6 @@ def main():
     tmp = args.tmp
     strict = args.strict
     checkCoorthologs = args.checkCoorthologsRef
-    #filter = args.filter
-    #if filter == True or filter == 'yes':
-        #filter = 'yes'
-    #else:
-        #filter = 'no'
     #others
     average_intron_length = args.avIntron
     length_extension = args.lengthExtension
@@ -852,7 +892,6 @@ def main():
     else:
         gene_prediction = "metaeuk"
 
-
     # output modes
     if debug == True and silent == True:
         print("It's not possible to use booth modes, please restart and use --debug or --silent")
@@ -952,8 +991,6 @@ def main():
             else:
                 print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0])
 
-
-
     ################################# paths ####################################
 
     fasta_path = core_path + "/" + group +"/"+ group + ".fa"
@@ -980,6 +1017,9 @@ def main():
         profile_path = blockProfiles(core_path, group, mode)
         group_computation_time_end = time.time()
         time_group = group_computation_time_end - group_computation_time_start
+    else:
+        print("test")
+        #concatinade core_group sequences if metaeuk should be run without tblastn
 
 
     ###################### ortholog search #####################################

From 5cb0f2bba80f33ae3b35861b7891a14ff6ae34ce Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:26:37 +0100
Subject: [PATCH 159/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f12e9cc..f891b47 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -736,7 +736,7 @@ def ortholog_search_tblastn(args):
         metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path)
         time_metaeuk_end = time.time()
         time_metaeuk = time_metaeuk_end - time_metaeuk_start
-        output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))")
+        output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))
 
     ################# backward search to filter for orthologs###################
     if int(os.path.getsize(candidatesOutFile)) <= 0:

From cb085c71af0bda7eb2f7907f0c6a01fa4719f00d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:31:14 +0100
Subject: [PATCH 160/192] bug fix

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f891b47..64192b1 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -814,7 +814,7 @@ def main():
     #################### handle user input #####################################
 
     start = time.time()
-    version = '0.1.2'
+    version = '0.1.3'
     ################### initialize parser ######################################
     parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
     parser.add_argument('--version', action='version', version=str(version))
@@ -883,7 +883,7 @@ def main():
 
     #gene prediction tool
     augustus = args.augustus
-    if augutus == True:
+    if augustus == True:
         augustus_ref_species = args.augustusRefSpec
         if augustus_ref_species == '':
             print("Augustus reference species is required when using Augustus as gene prediction tool")

From 0d2d26db84d471960cf9e61e18d7721befce253c Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:36:11 +0100
Subject: [PATCH 161/192] bug fix

---
 fdog/fDOGassembly.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 64192b1..f68c3aa 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -754,7 +754,7 @@ def ortholog_search_tblastn(args):
 
     return reciprocal_sequences, candidatesOutFile, output
 
-def blockProfiles(core_path, group, mode):
+def blockProfiles(core_path, group, mode, out):
 
     ######################## paths ################################
     msa_path = core_path + "/" + group +"/"+ group + ".aln"
@@ -780,7 +780,7 @@ def blockProfiles(core_path, group, mode):
 
     return profile_path
 
-def consensusSequence(core_path, group, mode):
+def consensusSequence(core_path, group, mode, out):
 
     ######################## paths ################################
     hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
@@ -1013,8 +1013,8 @@ def main():
 
     if augustus == True:
         group_computation_time_start = time.time()
-        consensus_path = consensusSequence(core_path, group, mode)
-        profile_path = blockProfiles(core_path, group, mode)
+        consensus_path = consensusSequence(core_path, group, mode, out)
+        profile_path = blockProfiles(core_path, group, mode, out)
         group_computation_time_end = time.time()
         time_group = group_computation_time_end - group_computation_time_start
     else:

From 8d9ce6015e2b3a395d546b1f0033e918f0e3e1d2 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:43:01 +0100
Subject: [PATCH 162/192] added preparation steps for metaeuk (tblastn search
 preparation)

---
 fdog/fDOGassembly.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f68c3aa..aa037e8 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1019,7 +1019,11 @@ def main():
         time_group = group_computation_time_end - group_computation_time_start
     else:
         print("test")
+        group_computation_time_start = time.time()
+        consensus_path = consensusSequence(core_path, group, mode, out)
         #concatinade core_group sequences if metaeuk should be run without tblastn
+        group_computation_time_end = time.time()
+        time_group = group_computation_time_end - group_computation_time_start
 
 
     ###################### ortholog search #####################################

From 65c8835fd080a227dc19f0f51dad39668e114130 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:45:28 +0100
Subject: [PATCH 163/192] bug fix

---
 fdog/fDOGassembly.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index aa037e8..c82e8fb 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -880,11 +880,12 @@ def main():
     force = args.force
     append = args.append
     parallel = args.parallel
+    augustus_ref_species = args.augustusRefSpec
 
     #gene prediction tool
     augustus = args.augustus
     if augustus == True:
-        augustus_ref_species = args.augustusRefSpec
+
         if augustus_ref_species == '':
             print("Augustus reference species is required when using Augustus as gene prediction tool")
             return 1

From 83275925f7e71b0d8b0609b79b89216a46b3084d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:47:26 +0100
Subject: [PATCH 164/192] bug fix

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c82e8fb..8bbfeba 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1020,6 +1020,7 @@ def main():
         time_group = group_computation_time_end - group_computation_time_start
     else:
         print("test")
+        profile_path = ""
         group_computation_time_start = time.time()
         consensus_path = consensusSequence(core_path, group, mode, out)
         #concatinade core_group sequences if metaeuk should be run without tblastn

From fb62700935cb87d4d03b32ca0ecc36346ee02037 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:48:47 +0100
Subject: [PATCH 165/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 8bbfeba..11a8504 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -733,7 +733,7 @@ def ortholog_search_tblastn(args):
         output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName))
     else:
         time_metaeuk_start = time.time()
-        metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path)
+        metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path)
         time_metaeuk_end = time.time()
         time_metaeuk = time_metaeuk_end - time_metaeuk_start
         output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))

From f5e25dbc5fb65596bd65312a3e6d6feb83529653 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:51:55 +0100
Subject: [PATCH 166/192] bug fix

---
 fdog/fDOGassembly.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 11a8504..54294d4 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -214,15 +214,15 @@ def extract_seq(region_dic, path, tmp_path, mode):
 
 def extract_sequence_from_to(name, file, start, end):
     out = name + ".fasta"
-    if start < 0:
+    if int(start) < 0:
         start = 0
     with open(out,"w") as f:
         for seq_record in SeqIO.parse(file, "fasta"):
                 f.write(str(seq_record.id) + "\n")
                 sequence_length = len(seq_record.seq)
-                if end > sequence_length:
+                if int(end) > sequence_length:
                     end = sequence_length
-                f.write(str(seq_record.seq[start:end]) + "\n")
+                f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
 
     return out, start, end
 

From e59ae539a7e4a679058c1d2535aa53809b9ccb5e Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:54:08 +0100
Subject: [PATCH 167/192] bug fix

---
 fdog/fDOGassembly.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 54294d4..990bbd0 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -213,6 +213,7 @@ def extract_seq(region_dic, path, tmp_path, mode):
         starting_subprocess(cmd, mode)
 
 def extract_sequence_from_to(name, file, start, end):
+    print(name)
     out = name + ".fasta"
     if int(start) < 0:
         start = 0
@@ -222,6 +223,8 @@ def extract_sequence_from_to(name, file, start, end):
                 sequence_length = len(seq_record.seq)
                 if int(end) > sequence_length:
                     end = sequence_length
+                    print(start)
+                    print(end)
                 f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
 
     return out, start, end

From 188ae4b8a54866978b984335042e74b0d0b9ecc3 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 11:58:05 +0100
Subject: [PATCH 168/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 990bbd0..233d8f5 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -219,7 +219,7 @@ def extract_sequence_from_to(name, file, start, end):
         start = 0
     with open(out,"w") as f:
         for seq_record in SeqIO.parse(file, "fasta"):
-                f.write(str(seq_record.id) + "\n")
+                f.write(">" + str(seq_record.id) + "\n")
                 sequence_length = len(seq_record.seq)
                 if int(end) > sequence_length:
                     end = sequence_length

From 93e79fea116a8387aa8d5df5b08b7b143ada2078 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 12:03:07 +0100
Subject: [PATCH 169/192] bug fix

---
 fdog/fDOGassembly.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 233d8f5..11091da 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -213,7 +213,7 @@ def extract_seq(region_dic, path, tmp_path, mode):
         starting_subprocess(cmd, mode)
 
 def extract_sequence_from_to(name, file, start, end):
-    print(name)
+    #print(name)
     out = name + ".fasta"
     if int(start) < 0:
         start = 0
@@ -223,8 +223,8 @@ def extract_sequence_from_to(name, file, start, end):
                 sequence_length = len(seq_record.seq)
                 if int(end) > sequence_length:
                     end = sequence_length
-                    print(start)
-                    print(end)
+                    #print(start)
+                    #print(end)
                 f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
 
     return out, start, end
@@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
             #metaeuk call
             cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk"
-            print(cmd)
+            #print(cmd)
             starting_subprocess(cmd, mode)
             # parsing header and sequences
             try:
@@ -290,7 +290,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                 for line in lines:
                     if line[0] == ">":
                         id += 1
-                        header = ">" + group + "|" + ass_name + "|" + name + "_" + id
+                        header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id)
                         output.write(header)
                     else:
                         output.write(line)

From 90eb408d967041e1d3f1960c8ebfe2745853d1ed Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 13:04:42 +0100
Subject: [PATCH 170/192] testing other parameters for metaeuk

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 11091da..25b4a6c 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -279,7 +279,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             name = key + "_" + str(counter)
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
             #metaeuk call
-            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk"
+            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
             #print(cmd)
             starting_subprocess(cmd, mode)
             # parsing header and sequences

From ca4133aa4ab7389d8c4827d8ebc6702988609e26 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 13:20:27 +0100
Subject: [PATCH 171/192] testing new parameters

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 25b4a6c..f35c80c 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -286,6 +286,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             try:
                 sequence_file = open(tmp_path + name + ".fas", "r")
                 lines = sequence_file.readlines()
+                print(lines)
                 id = 0
                 for line in lines:
                     if line[0] == ">":

From 6be72527e89676e3f1a89ffb8db492771d198307 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 13:30:19 +0100
Subject: [PATCH 172/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index f35c80c..a8995fa 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -299,7 +299,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             except FileNotFoundError:
                 pass
 
-        output.close()
+    output.close()
 
 def searching_for_db(assembly_path):
 

From 926963f369aeebc3bfeb5160574961061da90777 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 13:44:33 +0100
Subject: [PATCH 173/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index a8995fa..0836198 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -291,7 +291,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                 for line in lines:
                     if line[0] == ">":
                         id += 1
-                        header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id)
+                        header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + "\n"
                         output.write(header)
                     else:
                         output.write(line)

From 062eefcc7fc94bba111c1c1e977d2fd8a3f4caec Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 13:44:38 +0100
Subject: [PATCH 174/192] testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 0836198..48a6f85 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
             #metaeuk call
             cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
-            #print(cmd)
+            print(cmd)
             starting_subprocess(cmd, mode)
             # parsing header and sequences
             try:

From 49c080e1b76bb65e89268ba46a52dc86d06e4ffc Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 13:54:56 +0100
Subject: [PATCH 175/192] testing

---
 fdog/fDOGassembly.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 48a6f85..ebca99e 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -223,8 +223,9 @@ def extract_sequence_from_to(name, file, start, end):
                 sequence_length = len(seq_record.seq)
                 if int(end) > sequence_length:
                     end = sequence_length
-                    #print(start)
-                    #print(end)
+                #for testing only
+                start = 0
+                end = len(seq_record.seq)
                 f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
 
     return out, start, end

From fb8e97aff28edb0d827ccef10890a8997e9ec1b0 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 14:06:25 +0100
Subject: [PATCH 176/192] testing

---
 fdog/fDOGassembly.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index ebca99e..d22b281 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end):
                 if int(end) > sequence_length:
                     end = sequence_length
                 #for testing only
-                start = 0
-                end = len(seq_record.seq)
+                #start = 0
+                #end = len(seq_record.seq)
                 f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
 
     return out, start, end
@@ -281,13 +281,13 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
             #metaeuk call
             cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
-            print(cmd)
+            #print(cmd)
             starting_subprocess(cmd, mode)
             # parsing header and sequences
             try:
                 sequence_file = open(tmp_path + name + ".fas", "r")
                 lines = sequence_file.readlines()
-                print(lines)
+                #print(lines)
                 id = 0
                 for line in lines:
                     if line[0] == ">":

From be1b56a32c98610b5f8360fd20f1f777e8875b1f Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 14:42:37 +0100
Subject: [PATCH 177/192] metaeuk is included and running in fdog_assembly
 workflow

---
 fdog/fDOGassembly.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d22b281..40c63f8 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -280,8 +280,9 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             name = key + "_" + str(counter)
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
             #metaeuk call
-            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
+            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk"
             #print(cmd)
+            # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1
             starting_subprocess(cmd, mode)
             # parsing header and sequences
             try:
@@ -1024,7 +1025,7 @@ def main():
         group_computation_time_end = time.time()
         time_group = group_computation_time_end - group_computation_time_start
     else:
-        print("test")
+        #print("test")
         profile_path = ""
         group_computation_time_start = time.time()
         consensus_path = consensusSequence(core_path, group, mode, out)

From cb9a5fd6c0e23f6907dd8a056bc2fe1dc2736d96 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 15:07:16 +0100
Subject: [PATCH 178/192] testing other metaeuk parameters

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 40c63f8..edaaffe 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             name = key + "_" + str(counter)
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
             #metaeuk call
-            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk"
+            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
             #print(cmd)
             # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1
             starting_subprocess(cmd, mode)

From 79791e8f52c95ea2e2e62d228081225508eca07f Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 15:20:37 +0100
Subject: [PATCH 179/192] using complete contigs for metaeuk

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index edaaffe..c837c33 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end):
                 if int(end) > sequence_length:
                     end = sequence_length
                 #for testing only
-                #start = 0
-                #end = len(seq_record.seq)
+                start = 0
+                end = len(seq_record.seq)
                 f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
 
     return out, start, end

From f6f72f7e0a5b3628045449afc9a350a542e1c339 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 10 Feb 2022 15:34:36 +0100
Subject: [PATCH 180/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index c837c33..edaaffe 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end):
                 if int(end) > sequence_length:
                     end = sequence_length
                 #for testing only
-                start = 0
-                end = len(seq_record.seq)
+                #start = 0
+                #end = len(seq_record.seq)
                 f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
 
     return out, start, end

From 61a1ee54036074d2d3079766dae26a1bd1a2b300 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 11 Feb 2022 11:41:27 +0100
Subject: [PATCH 181/192] added parameter for own metaeuk db

---
 fdog/fDOGassembly.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index edaaffe..20b74e3 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -266,7 +266,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
                 #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
     output.close()
 
-def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, core_group):
+def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db):
     output = open(candidatesOutFile, "w")
 
     for key in regions:
@@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             name = key + "_" + str(counter)
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
             #metaeuk call
-            cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
+            cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
             #print(cmd)
             # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1
             starting_subprocess(cmd, mode)
@@ -686,7 +686,7 @@ def clean_fas(path, file_type):
     file.close()
 
 def ortholog_search_tblastn(args):
-    (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction) = args
+    (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db) = args
     output = []
     cmd = 'mkdir ' + out + '/tmp/' + asName
     starting_subprocess(cmd, 'silent')
@@ -739,7 +739,11 @@ def ortholog_search_tblastn(args):
         output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName))
     else:
         time_metaeuk_start = time.time()
-        metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path)
+        if metaeuk_db == '':
+            db = fasta_path
+        else:
+            db = metaeuk_db
+        metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db)
         time_metaeuk_end = time.time()
         time_metaeuk = time_metaeuk_end - time_metaeuk_start
         output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))
@@ -856,6 +860,7 @@ def main():
     optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False)
     optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False)
     optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
+    optional.add_argument('--metaeukDb', help='path to metaeuk reference database', action='store', default='')
     args = parser.parse_args()
 
     # required
@@ -887,6 +892,7 @@ def main():
     append = args.append
     parallel = args.parallel
     augustus_ref_species = args.augustusRefSpec
+    metaeuk_db = args.metaeukDb
 
     #gene prediction tool
     augustus = args.augustus
@@ -964,6 +970,12 @@ def main():
         assemblyDir = dataPath + '/assembly_dir/'
     check_path(assemblyDir)
 
+    if metaeuk_db != '':
+        if not metaeuk_db.endswith('/'):
+            metaeuk_db = metaeuk_db + '/'
+        check_path(metaeuk_db)
+
+
     try:
         f = open(out + "/fdog.log", "a+")
     except FileNotFoundError:
@@ -1045,7 +1057,7 @@ def main():
         cpus = mp.cpu_count()
         pool = mp.Pool(cpus)
         for asName in assembly_names:
-            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction])
+            calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db])
 
         results = (pool.imap_unordered(ortholog_search_tblastn, calls))
         pool.close()
@@ -1057,7 +1069,7 @@ def main():
     else:
         ###################### computation species wise ################
         for asName in assembly_names:
-            args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction]
+            args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]
             reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args)
             ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
             for k in output_ortholog_search:

From 81ec9a562d52b9546fd4c7161e89725b9e23783a Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Fri, 11 Feb 2022 11:47:31 +0100
Subject: [PATCH 182/192] bugfix

---
 fdog/fDOGassembly.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 20b74e3..daf8bff 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -971,8 +971,6 @@ def main():
     check_path(assemblyDir)
 
     if metaeuk_db != '':
-        if not metaeuk_db.endswith('/'):
-            metaeuk_db = metaeuk_db + '/'
         check_path(metaeuk_db)
 
 

From 17a546a155cf5efa09f7c8e16c888a10a9d65615 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 14 Feb 2022 14:40:39 +0100
Subject: [PATCH 183/192] for debugging function get_distance_biopython

---
 fdog/fDOGassembly.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index daf8bff..4a05627 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -313,6 +313,7 @@ def searching_for_db(assembly_path):
     return check
 
 def get_distance_biopython(file, matrix):
+    print(file)
     aln = AlignIO.read(open(file), 'fasta')
     calculator = DistanceCalculator(matrix)
     dm = calculator.get_distance(aln)

From c260ce4b1fceabf421dbf2c2b459ee2ea92978f7 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Mon, 14 Feb 2022 15:19:06 +0100
Subject: [PATCH 184/192] testing

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 4a05627..664e429 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -313,7 +313,7 @@ def searching_for_db(assembly_path):
     return check
 
 def get_distance_biopython(file, matrix):
-    print(file)
+    #print(file)
     aln = AlignIO.read(open(file), 'fasta')
     calculator = DistanceCalculator(matrix)
     dm = calculator.get_distance(aln)
@@ -637,7 +637,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
     for record in candidates:
         for name in candidate_names:
             if name in record.id:
-                f.write(">" + name + "\n")
+                f.write(">" + record.id + "\n")
                 f.write(str(record.seq) + "\n")
     f.close()
 

From 0ec76787dffb4a5aa6b8ab0304992775f382335d Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Wed, 23 Feb 2022 10:47:17 +0100
Subject: [PATCH 185/192] bug fix, testing

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 664e429..ec41ec2 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -636,7 +636,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
 
     for record in candidates:
         for name in candidate_names:
-            if name in record.id:
+            if name == record.id:
                 f.write(">" + record.id + "\n")
                 f.write(str(record.seq) + "\n")
     f.close()

From 76e503819d7376a59a0a71b8fe9a3c548ad6ecf5 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Thu, 24 Feb 2022 11:04:24 +0100
Subject: [PATCH 186/192] bug fix

---
 fdog/fDOGassembly.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index ec41ec2..0aead0e 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -634,11 +634,14 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
             f.write(str(record.seq) +  "\n")
             break
 
+    already_written = []
     for record in candidates:
         for name in candidate_names:
             if name == record.id:
-                f.write(">" + record.id + "\n")
-                f.write(str(record.seq) + "\n")
+                if name not in already_written:
+                    f.write(">" + record.id + "\n")
+                    f.write(str(record.seq) + "\n")
+                    already_written.append(name)
     f.close()
 
     if msaTool == "muscle":

From ad12f0aaa68e331847b1e4379cb62cae56c2f729 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 1 Mar 2022 11:22:44 +0100
Subject: [PATCH 187/192] gff file positions were corrected during
 fDOG-Assembly run

---
 fdog/fDOGassembly.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 0aead0e..d7a8e37 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -268,6 +268,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug
 
 def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db):
     output = open(candidatesOutFile, "w")
+    region = open(candidatesOutFile.replace(".candidates.fa", ".regions.txt"), "w")
+    region.write("Conting/scaffold" + "\t" + "start" + "\t" + "end" + "\n")
 
     for key in regions:
         locations = regions[key]
@@ -279,6 +281,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
             end = str(i[1] + length_extension)
             name = key + "_" + str(counter)
             file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
+            region.write(file + "\t" + str(start) + "\t" + str(end))
             #metaeuk call
             cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " +  tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
             #print(cmd)
@@ -298,6 +301,15 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                     else:
                         output.write(line)
                 sequence_file.close()
+
+                gff_file = open(tmp_path + name + ".gff", "r")
+                    lines = gff_file.readlines()
+                    for line in lines:
+                        values = line.split("\t")
+                        values[3] = int(values[3]) + int(start)
+                        values[4] = int(values[4]) + int(start)
+                        gff_file.write("\t".join(values))
+                gff_file.close()
             except FileNotFoundError:
                 pass
 

From 6b15f26c04e30b3516d2b560527498c255474e74 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 1 Mar 2022 13:56:48 +0100
Subject: [PATCH 188/192] bug fix

---
 fdog/fDOGassembly.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index d7a8e37..e8ed0ee 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -303,12 +303,12 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                 sequence_file.close()
 
                 gff_file = open(tmp_path + name + ".gff", "r")
-                    lines = gff_file.readlines()
-                    for line in lines:
-                        values = line.split("\t")
-                        values[3] = int(values[3]) + int(start)
-                        values[4] = int(values[4]) + int(start)
-                        gff_file.write("\t".join(values))
+                lines = gff_file.readlines()
+                for line in lines:
+                    values = line.split("\t")
+                    values[3] = int(values[3]) + int(start)
+                    values[4] = int(values[4]) + int(start)
+                    gff_file.write("\t".join(values))
                 gff_file.close()
             except FileNotFoundError:
                 pass

From 7d7504f1f76e01a4cd27cad5a371ef3c6cc7bcf4 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 1 Mar 2022 14:18:39 +0100
Subject: [PATCH 189/192] bug fix

---
 fdog/fDOGassembly.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index e8ed0ee..051f331 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -306,8 +306,8 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                 lines = gff_file.readlines()
                 for line in lines:
                     values = line.split("\t")
-                    values[3] = int(values[3]) + int(start)
-                    values[4] = int(values[4]) + int(start)
+                    values[3] = str(int(values[3]) + int(start))
+                    values[4] = str(int(values[4]) + int(start))
                     gff_file.write("\t".join(values))
                 gff_file.close()
             except FileNotFoundError:

From 826d676f3846cfa16a6fbba5cdba0d066e158023 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 1 Mar 2022 14:40:52 +0100
Subject: [PATCH 190/192] bug fix

---
 fdog/fDOGassembly.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 051f331..3770e9b 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -306,8 +306,10 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                 lines = gff_file.readlines()
                 for line in lines:
                     values = line.split("\t")
-                    values[3] = str(int(values[3]) + int(start))
-                    values[4] = str(int(values[4]) + int(start))
+                    new_start = int(values[3]) + int(start)
+                    values[3] = str(new_start)
+                    new_end = int(values[4]) + int(start)
+                    values[4] = str(new_end)
                     gff_file.write("\t".join(values))
                 gff_file.close()
             except FileNotFoundError:

From 8a832fc1c67161e9361a94bc29f32d9863e284a0 Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 1 Mar 2022 15:00:56 +0100
Subject: [PATCH 191/192] bug fix

---
 fdog/fDOGassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 3770e9b..2168b5d 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -302,7 +302,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                         output.write(line)
                 sequence_file.close()
 
-                gff_file = open(tmp_path + name + ".gff", "r")
+                gff_file = open(tmp_path + name + ".gff", "r+")
                 lines = gff_file.readlines()
                 for line in lines:
                     values = line.split("\t")

From 14c852c8ed8b53d5f2007820406084ac72908dea Mon Sep 17 00:00:00 2001
From: mueli94 <hannah.muelbaier@gmail.com>
Date: Tue, 1 Mar 2022 15:34:59 +0100
Subject: [PATCH 192/192] bug fix

---
 fdog/fDOGassembly.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index 2168b5d..7027236 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -302,15 +302,18 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group
                         output.write(line)
                 sequence_file.close()
 
-                gff_file = open(tmp_path + name + ".gff", "r+")
+                gff_file = open(tmp_path + name + ".gff", "r")
                 lines = gff_file.readlines()
+                new_lines = []
                 for line in lines:
                     values = line.split("\t")
-                    new_start = int(values[3]) + int(start)
-                    values[3] = str(new_start)
-                    new_end = int(values[4]) + int(start)
-                    values[4] = str(new_end)
-                    gff_file.write("\t".join(values))
+                    values[3] = str(int(values[3]) + int(start))
+                    values[4] = str(int(values[4]) + int(start))
+                    new_lines.append("\t".join(values))
+                gff_file.close()
+                gff_file = open(tmp_path + name + ".gff", "w")
+                for line in new_lines:
+                    gff_file.write(line)
                 gff_file.close()
             except FileNotFoundError:
                 pass