diff --git a/.DS_Store b/.DS_Store
index fa2521e..bcbd073 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/README.md b/README.md
index 52f11e2..8db83ce 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@
[![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG)
![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg)
+# Poster fDOG - Assembly
+https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf
# Table of Contents
* [How to install](#how-to-install)
* [Install the fDOG package](#install-the-fdog-package)
diff --git a/fdog/.DS_Store b/fdog/.DS_Store
index f638c26..a99a01c 100644
Binary files a/fdog/.DS_Store and b/fdog/.DS_Store differ
diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl
index b762854..09ec8ec 100755
--- a/fdog/bin/hamstr.pl
+++ b/fdog/bin/hamstr.pl
@@ -200,6 +200,7 @@
######################## start main ###########################################
my $version = "HaMStR v.13.4.6";
+
######################## checking whether the configure script has been run ###
my $configure = 0;
if ($configure == 0){
diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl
index 0352c0d..2fe8333 100755
--- a/fdog/bin/oneSeq.pl
+++ b/fdog/bin/oneSeq.pl
@@ -135,6 +135,7 @@
############ General settings
my $version = 'oneSeq v.2.4.1';
+
##### configure for checking if the setup.sh script already run
my $configure = 0;
if ($configure == 0){
@@ -208,7 +209,6 @@
my $idx_dir = "$path/taxonomy/";
my $dataDir = $path . '/data';
my $weightPath = "$path/weight_dir/";
-my $assembly_dir = "$path/assembly_dir/";
my @defaultRanks = (
'superkingdom', 'kingdom',
@@ -313,15 +313,6 @@
my %hashTree;
my $aln = 'muscle';
my $searchTaxa;
-#variables for fdog_goes_assembly
-my $assembly;
-my $augustusRefSpec;
-my $avIntron;
-my $lengthExtension;
-my $assemblyPath;
-my $searchTool = 'blast';
-my $matrix = 'blosum62';
-my $dataPath = '';
################# Command line options
GetOptions (
"h" => \$help,
@@ -384,15 +375,7 @@
"distDeviation=s" => \$distDeviation,
"aligner=s" => \$aln,
"hyperthread" => \$hyperthread,
- "searchTaxa=s" => \$searchTaxa,
- "assembly" => \$assembly,
- "assemblypath=s" => \$assemblyPath,
- "augustusRefSpec=s" => \$augustusRefSpec,
- "avIntron=s" => \$avIntron,
- "lengthExtension=s" => \$lengthExtension,
- "searchTool=s" => \$searchTool,
- "scoringmatrix=s" => \$matrix,
- "dataPath=s" => \$dataPath
+ "searchTaxa=s" => \$searchTaxa
);
$outputPath = abs_path($outputPath);
@@ -404,8 +387,6 @@
$weightPath = abs_path($weightPath)."/";
$genome_dir = abs_path($genome_dir)."/";
$taxaPath = $genome_dir;
-$dataPath = abs_path($dataPath)."/";
-$assembly_dir = abs_path($assemblyPath)."/";
############# do initial check
if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) {
@@ -415,7 +396,7 @@
initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff);
}
- if (!defined $coreex && !defined $assembly) {
+ if (!defined $coreex) {
if (!grep(/$minDist/, @defaultRanks)) {
die "ERROR: minDist $minDist invalid!\n";
}
@@ -499,7 +480,7 @@
# create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction)
# get annotations for seed sequence if fas support is on
-if ($fas_support && !$assembly){
+if ($fas_support){
if (!$weightPath) {
createWeightFolder();
}
@@ -508,7 +489,7 @@
my $coreStTime = gettime(); #time;
#core-ortholog search
-if (!$coreex && !$assembly) {
+if (!$coreex) {
print "\nCore compiling...\n";
$coremode = 1;
$taxaPath = $blastPath;
@@ -646,12 +627,7 @@
my $final_eval_blast = $eval_blast*$eval_relaxfac;
my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac;
- if (!$assembly){
- $taxaPath = $genome_dir;
- }
- else{
- $taxaPath = $assembly_dir;
- }
+ $taxaPath = $genome_dir;
my @searchTaxa;
unless ($searchTaxa) {
unless($groupNode) {
@@ -741,11 +717,15 @@
if (-e $finalOutput) {
addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
}
+### remove duplicated seq in extended.fa
+if (-e $finalOutput) {
+ addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
+}
push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!";
print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n";
-
-if(!$coreOnly && !$assembly){
+## Evaluation of all orthologs that are predicted by the final run
+if(!$coreOnly){
my $fasStTime = gettime();
my $processID = $$;
@@ -757,7 +737,7 @@
addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
# calculate FAS scores for final extended.fa
- if ($fas_support && !$assembly) {
+ if ($fas_support) {
print "Starting the feature architecture similarity score computation...\n";
my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno";
unless ($countercheck) {
@@ -770,21 +750,12 @@
}
push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
-
if($autoclean){
print "Cleaning up...\n";
runAutoCleanUp($processID);
}
}
-if ($assembly){
- my $file_assembly_out;
- $file_assembly_out = $outputPath . '/' . $seqName;
- my $cmd_merge;
- $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup";
- printDebug($cmd_merge);
- system($cmd_merge);
-}
## Delete tmp folder
unless ($debug) {
my $delTmp = "rm -rf $tmpdir";
@@ -1194,10 +1165,10 @@ sub checkOptions {
if ($force == 1 and $append ==1) {
$force = 0;
}
- ### check the presence of the pre-computed core set if options reuseCore or assembly is used
- if ($coreex || $assembly) {
+ ### check the presence of the pre-computed core set
+ if ($coreex) {
if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") {
- print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
+ print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n";
exit;
}
}
@@ -1268,7 +1239,7 @@ sub checkOptions {
### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected
$optbreaker = 0;
- while(!$minCoreOrthologs and (!$coreex and !$assembly)) {
+ while(!$minCoreOrthologs and !$coreex) {
if ($optbreaker >= 3){
print "No proper number given ... exiting.\n";
exit;
@@ -1283,12 +1254,10 @@ sub checkOptions {
$filter = 'no' if $filter eq 'F';
}
- if (!$assembly){
- $inputSeq = fetchSequence($seqFile, $dataDir);
- }
+ $inputSeq = fetchSequence($seqFile, $dataDir);
## the user has not provided a sequence id, however, the refspec is determined.
- if($seqId eq '' && !$assembly) {
+ if($seqId eq '') {
my $besthit;
if (!$blast){
## a refspec has been determined
@@ -1398,9 +1367,8 @@ sub checkOptions {
#### checking for the min and max distance for the core set compilation
#### omit this check, if the option reuseCore has been selected (added 2019-02-04)
$optbreaker = 0;
- if (!$coreex and !$assembly) {
+ if (!$coreex) {
my $node;
- #print "Testing coreex assembly\n";
$node = $db->get_taxon(-taxonid => $refTaxa{$refSpec});
$node->name('supplied', $refSpec);
if (lc($maxDist) eq "root"){
@@ -2673,7 +2641,7 @@ sub initialCheck {
}
}
# check weight_dir
- if ($fasoff != 1 && !$assembly) {
+ if ($fasoff != 1) {
my %seen;
my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir);
my @notFolder;
diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py
index b802b26..7027236 100644
--- a/fdog/fDOGassembly.py
+++ b/fdog/fDOGassembly.py
@@ -1,3 +1,21 @@
+# -*- coding: utf-8 -*-
+
+#######################################################################
+# Copyright (C) 2021 Hannah Muelbaier
+#
+# This script is used to run fDOG-Assembly which performs targeted ortholog
+# searches on genome assemblies
+#
+# This script is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for
+# more details
+#
+# Contact: hannah.muelbaier@gmail.com
+#
+#######################################################################
+
############################ imports ###########################################
import os
import os.path
@@ -8,7 +26,31 @@
import argparse
import yaml
import subprocess
+import time
+import shutil
+import multiprocessing as mp
+
########################### functions ##########################################
+def check_path(path):
+ if not os.path.exists(path):
+ print(path + " does not exist. Exciting ...")
+ sys.exit()
+
+def check_ref_sepc(species_list, fasta_file):
+ file = open(fasta_file, "r")
+ lines = file.readlines()
+ species_file = []
+
+ for line in lines:
+ if line[0] == ">":
+ species = line.split("|")[1]
+ species_file.append(species)
+ for species in species_list:
+ if species in species_file:
+ return species
+ print("Reference species is not part of the ortholog group. Exciting ...")
+ sys.exit()
+
def load_config(config_file):
with open(config_file, 'r') as stream:
try:
@@ -16,82 +58,129 @@ def load_config(config_file):
except yaml.YAMLError as exc:
print(exc)
+def starting_subprocess(cmd, mode, time_out = None):
+
+ try:
+ if mode == 'debug':
+ result = subprocess.run(cmd, shell=True, timeout = time_out)
+ elif mode == 'silent':
+ result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out)
+ elif mode == 'normal':
+ result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out)
+ except subprocess.TimeoutExpired:
+ return 1
+
def merge(blast_results, insert_length):
+ #merging overlapping and contigous candidate regions
+ #format dictionary: {node_name: [(,,evalue, ,,, )]}
number_regions = 0
+ insert_length = int(insert_length)
+ score_list = []
for key in blast_results:
locations = blast_results[key]
locations = sorted(locations, key = lambda x: int(x[3]))
- #print("test")
- #print(locations)
size_list = len(locations)
-
j = 0
-
while j < size_list-1:
- i = 1
- while i < size_list-1:
-
- if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])):
- #merge overlapping regions
+ i = j + 1
+ while i < size_list:
+ if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
+ #merge overlapping regions plus strand
locations[j][1] = max(locations[j][1], locations[i][1])
locations[j][2] = min(locations[j][2], locations[i][2])
+ locations[j][4] = max(locations[j][4], locations[i][4])
+ locations[j][6] = max(locations[j][6], locations[i][6])
+ locations.pop(i)
+ size_list -= 1
+ i -= 1
+ elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+ #merge overlapping regions minus strand
+ locations[j][0] = min(locations[j][0], locations[i][0])
+ locations[j][2] = min(locations[j][2], locations[i][2])
+ locations[j][4] = max(locations[j][4], locations[i][4])
+ locations[j][6] = max(locations[j][6], locations[i][6])
locations.pop(i)
size_list -= 1
i -= 1
- elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5])):
- #print(j)
+ elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')):
+ #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand
locations[j][1] = max(locations[j][1], locations[i][1])
locations[j][2] = min(locations[j][2], locations[i][2])
+ locations[j][4] = max(locations[j][4], locations[i][4])
+ locations[j][6] = max(locations[j][6], locations[i][6])
+ locations.pop(i)
+ size_list -= 1
+ i -=1
+ elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')):
+ #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand
+ locations[j][0] = min(locations[j][0], locations[i][0])
+ locations[j][2] = min(locations[j][2], locations[i][2])
+ locations[j][4] = max(locations[j][4], locations[i][4])
+ locations[j][6] = max(locations[j][6], locations[i][6])
locations.pop(i)
size_list -= 1
i -=1
i += 1
j += 1
+ for entry in locations:
+ score_list.append(entry[6])
number_regions += len(locations)
blast_results[key] = locations
- #print(blast_results)
- return blast_results, number_regions
+ return blast_results, number_regions, score_list
-def parse_blast(line, blast_results):
- # format blast line:
- #fomrat dictionary: {node_name: [(,)]}
- #print(line)
+def parse_blast(line, blast_results, cutoff):
+ # format blast line:
+ # format dictionary: {node_name: [(,,evalue, ,,, )]}
line = line.replace("\n", "")
line_info = line.split("\t")
- #print(line_info)
evalue = float(line_info[3])
-
#cut off
- if evalue > 0.00001:
+ if evalue > cutoff:
return blast_results, evalue
#add region to dictionary
else:
- node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5]
+ node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6])
split = node_name.split("|")
-
- # finding out on which strand tBLASTn founded a hit
+ # finding out on which strand tBLASTn found a hit
if sstart < send:
strand = "+"
else:
- sstart = line_info[2]
- send = line_info[1]
+ sstart = int(line_info[2])
+ send = int(line_info[1])
strand = "-"
-
- #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001
+ #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off
if len(split) > 1:
node_name = split[1]
if node_name in blast_results:
list = blast_results[node_name]
- list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand])
+ list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score])
blast_results[node_name] = list
else:
- blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]]
+ blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]]
return blast_results, evalue
-def candidate_regions(intron_length, evalue, tmp_path):
+def get_x_results(blast_dic, x, score_list):
+
+ new_dic = {}
+ score_list.sort(reverse=True)
+ min = score_list[x - 1]
+ number_regions = 0
+
+ for key in blast_dic:
+ key_list = []
+ entries = blast_dic[key]
+ for i in entries:
+ if i[6] >= min:
+ key_list.append(i)
+ if key_list != []:
+ new_dic[key] = key_list
+ number_regions += len(key_list)
+ return new_dic, number_regions
+
+def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10):
###################### extracting candidate regions ########################
# info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
blast_file = open(tmp_path + "/blast_results.out", "r")
@@ -104,67 +193,144 @@ def candidate_regions(intron_length, evalue, tmp_path):
if not line:
break
#parsing blast output
- blast_results, evalue = parse_blast(line, blast_results)
- #evalue cut-off
- if not evalue <= evalue:
- break
+ blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue)
+
if blast_results == {}:
+ blast_file.close()
return 0,0
else:
- candidate_regions, number_regions = merge(blast_results, intron_length)
- #candidate_regions, number_regions = merge_regions(blast_results, cut_off)
- #print(candidate_regions, number_regions)
+ candidate_regions, number_regions, score_list = merge(blast_results, intron_length)
+ blast_file.close()
+ if number_regions > x:
+ candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list)
return candidate_regions, number_regions
-def extract_seq(region_dic, path, tmp_path):
- #print(region_dic)
+def extract_seq(region_dic, path, tmp_path, mode):
+
for key in region_dic:
#print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f")
cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f"
- result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
-
-def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path):
+ starting_subprocess(cmd, mode)
+
+def extract_sequence_from_to(name, file, start, end):
+ #print(name)
+ out = name + ".fasta"
+ if int(start) < 0:
+ start = 0
+ with open(out,"w") as f:
+ for seq_record in SeqIO.parse(file, "fasta"):
+ f.write(">" + str(seq_record.id) + "\n")
+ sequence_length = len(seq_record.seq)
+ if int(end) > sequence_length:
+ end = sequence_length
+ #for testing only
+ #start = 0
+ #end = len(seq_record.seq)
+ f.write(str(seq_record.seq[int(start):int(end)]) + "\n")
+
+ return out, start, end
+
+def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode):
output = open(candidatesOutFile, "w")
for key in regions:
locations = regions[key]
counter = 0
for i in locations:
+ # some variables
counter += 1
start = str(i[0] - length_extension)
end = str(i[1] + length_extension)
name = key + "_" + str(counter)
- #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff")
-
+ # augutus call
cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff"
- result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True)
+ #print(cmd)
+ starting_subprocess(cmd, 'silent')
+ # transfer augustus output to as sequence
cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff"
- result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
+ starting_subprocess(cmd, mode)
+ # parsing header and sequences
+ try:
+ sequence_file = open(tmp_path + name + ".aa", "r")
+ lines = sequence_file.readlines()
+ for line in lines:
+ if line[0] == ">":
+ id = line.replace(">", "")
+ header = ">" + group + "|" + ass_name + "|" + name + "_" + id
+ output.write(header)
+ else:
+ output.write(line)
+ sequence_file.close()
+ except FileNotFoundError:
+ pass
+ #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region")
+ output.close()
- sequence_file = open(tmp_path + name + ".aa", "r")
- lines = sequence_file.readlines()
- for line in lines:
- if line[0] == ">":
- id = line.replace(">", "")
- header = ">" + group + "|" + ass_name + "|" + name + "_" + id
- output.write(header)
- else:
- output.write(line)
- sequence_file.close()
+def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db):
+ output = open(candidatesOutFile, "w")
+ region = open(candidatesOutFile.replace(".candidates.fa", ".regions.txt"), "w")
+ region.write("Conting/scaffold" + "\t" + "start" + "\t" + "end" + "\n")
+
+ for key in regions:
+ locations = regions[key]
+ counter = 0
+ for i in locations:
+ #some variables
+ counter += 1
+ start = str(i[0] - length_extension)
+ end = str(i[1] + length_extension)
+ name = key + "_" + str(counter)
+ file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end)
+ region.write(file + "\t" + str(start) + "\t" + str(end))
+ #metaeuk call
+ cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1"
+ #print(cmd)
+ # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1
+ starting_subprocess(cmd, mode)
+ # parsing header and sequences
+ try:
+ sequence_file = open(tmp_path + name + ".fas", "r")
+ lines = sequence_file.readlines()
+ #print(lines)
+ id = 0
+ for line in lines:
+ if line[0] == ">":
+ id += 1
+ header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + "\n"
+ output.write(header)
+ else:
+ output.write(line)
+ sequence_file.close()
+
+ gff_file = open(tmp_path + name + ".gff", "r")
+ lines = gff_file.readlines()
+ new_lines = []
+ for line in lines:
+ values = line.split("\t")
+ values[3] = str(int(values[3]) + int(start))
+ values[4] = str(int(values[4]) + int(start))
+ new_lines.append("\t".join(values))
+ gff_file.close()
+ gff_file = open(tmp_path + name + ".gff", "w")
+ for line in new_lines:
+ gff_file.write(line)
+ gff_file.close()
+ except FileNotFoundError:
+ pass
output.close()
def searching_for_db(assembly_path):
- #print("test: " + str(assembly_path) + "\n")
+
db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto']
check = True
for end in db_endings:
- #print(assembly_path + end + "\n")
- check = check and os.path.exists(assembly_path + end)
- #print(check)
+ if not any(File.endswith(end) for File in os.listdir(assembly_path)):
+ check = False
return check
def get_distance_biopython(file, matrix):
+ #print(file)
aln = AlignIO.read(open(file), 'fasta')
calculator = DistanceCalculator(matrix)
dm = calculator.get_distance(aln)
@@ -216,16 +382,28 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
if msaTool == "muscle":
os.system("muscle -quiet -in " + output_file + " -out " + aln_file)
#print("muscle -quiet -in " + output_file + " -out " + aln_file)
+ if not os.path.exists(aln_file):
+ print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.")
+ os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
+
elif msaTool == "mafft-linsi":
#print("mafft-linsi")
os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
- #d_ref = get_distance(aln_file, best_hit, ref)
- #d = get_distance(aln_file, best_hit, candidate_name)
- distances = get_distance_biopython(aln_file, matrix)
+ try:
+ distances = get_distance_biopython(aln_file, matrix)
+ distance_hit_query = distances[best_hit, candidate_name]
+ distance_ref_hit = distances[best_hit, ref]
+ #print(distances)
+ except ValueError:
+ pass
+ #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name)
+ return 0, "NaN", "NaN"
- distance_hit_query = distances[best_hit, candidate_name]
- distance_ref_hit = distances[best_hit, ref]
+
+
+ #distance_hit_query = distances[best_hit, candidate_name]
+ #distance_ref_hit = distances[best_hit, ref]
if distance_ref_hit < distance_hit_query:
#accepted
@@ -235,7 +413,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
#rejected
return 0, distance_ref_hit, distance_hit_query
-def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path):
+def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode):
# the backward search uses the genes predicted from augustus and makes a blastp search
#the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa
seedDic = getSeedInfo(fasta_path)
@@ -248,10 +426,11 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
try:
id_ref = seedDic[fdog_ref_species]
except KeyError:
- print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
+ #print("The fDOG reference species isn't part of the core ortholog group, ... exciting")
return 0, seed
if searchTool == "blast":
- os.system("blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile)
+ cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
+ starting_subprocess(cmd, mode)
else:
print("diamonds are the girls best friends")
##### diamond call
@@ -265,45 +444,46 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
id, gene, evalue = (line.replace("\n", "")).split("\t")
gene_name = gene.split("|")[2]
if gene_name != old_name:
- print("candidate:%s"%(gene_name))
- print("blast-hit:%s"%(id))
+ print("candidate:%s"%(gene_name)) if mode == "debug" else ""
+ print("blast-hit:%s"%(id)) if mode == "debug" else ""
min = float(evalue)
if id in id_ref:
orthologs.append(gene)
- print("\thitting\n")
+ print("\thitting\n") if mode == "debug" else ""
else:
if checkCo == True:
for i in id_ref:
- print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+ print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else ""
co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
if co_orthologs_result == 1:
- print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+ print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
orthologs.append(gene)
elif co_orthologs_result == 0:
- print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+ if distance_ref_hit != "NaN":
+ print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
else:
- print("\tnothitting\n")
+ print("\tnothitting\n") if mode == "debug" else ""
elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs:
if id in id_ref:
orthologs.append(gene)
- print("\thitting\n")
+ print("\thitting\n") if mode == "debug" else ""
else:
if checkCo == True:
for i in id_ref:
- print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i))
+ print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else ""
co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path)
if co_orthologs_result == 1:
- print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit))
+ print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
orthologs.append(gene)
elif co_orthologs_result == 0:
- print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit))
+ print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else ""
else:
- print("\tnot hitting\n")
+ print("\tnot hitting\n") if mode == "debug" else ""
old_name = gene_name
if orthologs == []:
- print("No hit in the backward search, ...exciting")
+ #print("No hit in the backward search, ...exciting")
return 0, seed
else:
@@ -328,15 +508,16 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
orthologs = set({})
for species in seed:
- print("backward search in species " + species + "\n")
+ print("backward search in species %s\n" %species)
orthologs_new = set({})
try:
id_ref = seedDic[species]
except KeyError:
- print("The species " + species + " isn't part of the core ortholog group, ... exciting")
+ #print("The species " + species + " isn't part of the core ortholog group, ... exciting")
return 0, seed
- os.system("blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile)
+ cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile
+ starting_subprocess(cmd, mode)
alg_file = open(tmp_path + "/blast_" + species, "r")
lines = alg_file.readlines()
alg_file.close()
@@ -355,23 +536,54 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
#print(species)
#print(orthologs_new)
+ #print(orthologs)
if species == fdog_ref_species:
orthologs = orthologs_new
else:
orthologs = orthologs & orthologs_new
- if orthologs == {}:
- print("No ortholog was found with option --strict")
+ if len(orthologs) == 0:
+ #print("No ortholog was found with option --strict")
return 0, seed
-
-
#print(orthologs)
+ orthologs = set(orthologs)
return list(orthologs), seed
-def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
- #print("addSequences")
- #print(sequenceIds)
+def addRef(output, core_fasta, species_list):
#print(species_list)
+ output_file = open(output, "a+")
+ seq_records_core = readFasta(core_fasta)
+ seq_records_core = list(seq_records_core)
+ for species in species_list:
+ for entry_core in seq_records_core:
+ if species in entry_core.id:
+ output_file.write(">" + entry_core.id + "\n")
+ output_file.write(str(entry_core.seq) + "\n")
+ output_file.close()
+
+def addSeq(output, seq_list):
+ output_file = open(output, "a+")
+
+ for item in seq_list:
+ #print(item)
+ candidate_fasta = item[1]
+ sequenceIds = item[0]
+ if sequenceIds == 0 or sequenceIds == []:
+ continue
+ seq_records_candidate = readFasta(candidate_fasta)
+ seq_records_candidate = list(seq_records_candidate)
+ for entry_candidate in seq_records_candidate:
+ if entry_candidate.id in sequenceIds:
+ if entry_candidate.id == sequenceIds[0]:
+ output_file.write(">" + entry_candidate.id + "|1" + "\n")
+ output_file.write(str(entry_candidate.seq) + "\n")
+ else:
+ output_file.write(">" + entry_candidate.id + "|0" + "\n")
+ output_file.write(str(entry_candidate.seq) + "\n")
+ output_file.close()
+
+def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path):
+
output_file = open(output, "a+")
if refBool == False:
seq_records_core = readFasta(core_fasta)
@@ -382,20 +594,24 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species
output_file.write(">" + entry_core.id + "\n")
output_file.write(str(entry_core.seq) + "\n")
- seq_records_candidate = readFasta(candidate_fasta)
- seq_records_candidate = list(seq_records_candidate)
- for entry_candidate in seq_records_candidate:
- #print(entry_candidate.id)
- #print(sequenceIds)
- if entry_candidate.id in sequenceIds:
- output_file.write(">" + entry_candidate.id + "\n")
- output_file.write(str(entry_candidate.seq) + "\n")
+ if sequenceIds != 0:
+ seq_records_candidate = readFasta(candidate_fasta)
+ seq_records_candidate = list(seq_records_candidate)
+ for entry_candidate in seq_records_candidate:
+ if entry_candidate.id in sequenceIds:
+ if entry_candidate.id == sequenceIds[0]:
+ output_file.write(">" + entry_candidate.id + "|1" + "\n")
+ output_file.write(str(entry_candidate.seq) + "\n")
+ else:
+ output_file.write(">" + entry_candidate.id + "|0" + "\n")
+ output_file.write(str(entry_candidate.seq) + "\n")
output_file.close()
return 0
def createFasInput(orthologsOutFile, mappingFile):
with open(orthologsOutFile, "r") as f:
fas_seed_id = (f.readline())[1:-1]
+ #fas_seed_id = fas_seed_id.split("|")[0]
mappingFile = open(mappingFile, "a+")
@@ -404,18 +620,22 @@ def createFasInput(orthologsOutFile, mappingFile):
ncbi_id = (seq.id.split("@"))[1]
mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n")
-
+ mappingFile.close()
return fas_seed_id
def cleanup(tmp, tmp_path):
if tmp == False:
- os.system('rm -r ' + tmp_path)
-
-def checkOptions():
- pass
- #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!!
+ timeout = time.time() + 60*1
+ while os.path.exists(tmp_path):
+ shutil.rmtree(tmp_path, ignore_errors=True)
+ if time.time() > timeout:
+ print("tmp folder could not be removed!")
+ break
def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
+ if len(candidate_names) == 1:
+ return candidate_names
+
candidates = readFasta(candidatesFile)
ref = readFasta(fasta)
@@ -431,18 +651,19 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
f.write(str(record.seq) + "\n")
break
+ already_written = []
for record in candidates:
for name in candidate_names:
- if name in record.id:
- f.write(">" + name + "\n")
- f.write(str(record.seq) + "\n")
+ if name == record.id:
+ if name not in already_written:
+ f.write(">" + record.id + "\n")
+ f.write(str(record.seq) + "\n")
+ already_written.append(name)
f.close()
if msaTool == "muscle":
os.system("muscle -quiet -in " + out + " -out " + aln_file)
- #print("muscle -quiet -in " + output_file + " -out " + aln_file)
elif msaTool == "mafft-linsi":
- #print("mafft-linsi")
os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file)
distances = get_distance_biopython(aln_file, matrix)
@@ -452,19 +673,160 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
for name in candidate_names:
distance = distances[ref_id , name]
- if distance < min_dist:
+ if distance <= min_dist:
min_dist = distance
min_name = name
- checked = []
-
+ checked = [min_name]
for name in candidate_names:
- if distances[min_name , name] < distances[min_name , ref_id]:
+ if name == min_name:
+ pass
+ elif distances[min_name , name] <= distances[min_name , ref_id]:
checked.append(name)
return checked
+def clean_fas(path, file_type):
+ file = open(path, "r")
+ lines = file.readlines()
+ file.close()
+ file = open(path,"w")
+
+ for line in lines:
+ if file_type == 'domains':
+ long_id, remain = line.split("#")
+ id = long_id.split("|")[0]
+ new_line = id + "#" + remain
+ else:
+ long_id, remain = line.split("\t", 1)
+ id = long_id.split("|")[0]
+ new_line = id + "\t" + remain
+
+ file.write(new_line)
+ file.close()
+
+def ortholog_search_tblastn(args):
+ (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db) = args
+ output = []
+ cmd = 'mkdir ' + out + '/tmp/' + asName
+ starting_subprocess(cmd, 'silent')
+ tmp_path = out + "tmp/" + asName + "/"
+ candidatesOutFile = tmp_path + group + ".candidates.fa"
+ #orthologsOutFile = out + "/" + group + ".extended.fa"
+ fasOutFile = out + "/" + group
+ #mappingFile = out + "/tmp/" + group + ".mapping.txt"
+
+ output.append("Searching in species " + asName + "\n")
+ assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
+ db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
+ blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/"
+ db_check = searching_for_db(blast_dir_path)
+
+ if db_check == 0:
+ cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path
+ starting_subprocess(cmd, mode)
+
+ #makes a tBLASTn search against database
+ #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+ cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out'
+ time_tblastn_start = time.time()
+ exit_code = starting_subprocess(cmd, mode, 3600)
+ time_tblastn_end = time.time()
+ time_tblastn = time_tblastn_end - time_tblastn_start
+ if exit_code == 1:
+ output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName)
+ return [], candidatesOutFile, output
+
+ output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName))
+
+ regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
+ if regions == 0:
+ #no candidat region are available, no ortholog can be found
+ output.append("No candidate region found for species %s!\n" % asName)
+ return [], candidatesOutFile, output
+
+ else:
+ output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName)
+ extract_seq(regions, db_path, tmp_path, mode)
+
+
+ if gene_prediction == "augustus":
+ ############### make Augustus PPX search ###################################
+ time_augustus_start = time.time()
+ augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode)
+ time_augustus_end = time.time()
+ time_augustus = time_augustus_end - time_augustus_start
+ output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName))
+ else:
+ time_metaeuk_start = time.time()
+ if metaeuk_db == '':
+ db = fasta_path
+ else:
+ db = metaeuk_db
+ metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db)
+ time_metaeuk_end = time.time()
+ time_metaeuk = time_metaeuk_end - time_metaeuk_start
+ output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))
+
+ ################# backward search to filter for orthologs###################
+ if int(os.path.getsize(candidatesOutFile)) <= 0:
+ #print("No genes found at candidate regions\n")
+ return [], candidatesOutFile, output
+
+ reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode)
+
+ if reciprocal_sequences == 0:
+ if regions != 0:
+ output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName)
+ return [], candidatesOutFile, output
+ else:
+ reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+
+ return reciprocal_sequences, candidatesOutFile, output
+
+def blockProfiles(core_path, group, mode, out):
+
+ ######################## paths ################################
+ msa_path = core_path + "/" + group +"/"+ group + ".aln"
+ check_path(msa_path)
+ profile_path = out + "/tmp/" + group + ".prfl"
+
+ ######################## block profile #####################################
+
+ print("Building a block profile ...")
+ cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
+ starting_subprocess(cmd, 'silent')
+
+ if int(os.path.getsize(profile_path)) > 0:
+ print("\t ...finished \n")
+ else:
+ print("Building block profiles failed. Using prepareAlign to convert alignment\n")
+ new_path = core_path + group +"/"+ group + "_new.aln"
+ cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
+ starting_subprocess(cmd, mode)
+ cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
+ starting_subprocess(cmd, 'silent')
+ print(" \t ...finished \n")
+
+ return profile_path
+
+def consensusSequence(core_path, group, mode, out):
+
+ ######################## paths ################################
+ hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
+ check_path(hmm_path)
+ consensus_path = out + "/tmp/" + group + ".con"
+
+ ######################## consensus sequence ################################
+ #make a majority-rule consensus sequence with the tool hmmemit from hmmer
+ print("Building a consensus sequence")
+ cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path
+ starting_subprocess(cmd, mode)
+ print("\t ...finished\n")
+
+ return consensus_path
+
class Logger(object):
def __init__(self, file):
self.file = file
@@ -478,24 +840,23 @@ def write(self, message):
def flush(self):
pass
-
def main():
- #################### handle user input ########################################
-
- version = '0.0.1'
+ #################### handle user input #####################################
+ start = time.time()
+ version = '0.1.3'
+ ################### initialize parser ######################################
parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.')
parser.add_argument('--version', action='version', version=str(version))
-
+ ################## required arguments ######################################
required = parser.add_argument_group('Required arguments')
required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/',
action='store', default='', required=True)
- required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True)
- required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True)
-
+ required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True)
+ ################## optional arguments ######################################
optional = parser.add_argument_group('Optional arguments')
- optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int)
+ optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int)
optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int)
optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='')
optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False)
@@ -508,34 +869,34 @@ def main():
optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle')
optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False)
optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
- optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='')
- optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
+ optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[])
+ #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no')
optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
- optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='')
+ optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[])
optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False)
-
+ optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False)
+ optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
+ optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
+ optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False)
+ optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False)
+ optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
+ optional.add_argument('--metaeukDb', help='path to metaeuk reference database', action='store', default='')
args = parser.parse_args()
# required
group = args.gene
- augustus_ref_species = args.augustusRefSpec
fdog_ref_species = args.refSpec
#paths user input
assemblyDir = args.assemblyPath
dataPath = args.dataPath
core_path = args.coregroupPath
- out = args.out + "/"
+ out = args.out
pathFile = args.pathFile
#I/O
tmp = args.tmp
strict = args.strict
checkCoorthologs = args.checkCoorthologsRef
- filter = args.filter
- if filter == True or filter == 'yes':
- filter = 'yes'
- else:
- filter = 'no'
#others
average_intron_length = args.avIntron
length_extension = args.lengthExtension
@@ -544,35 +905,38 @@ def main():
msaTool = args.msaTool
matrix = args.scoringmatrix
taxa = args.coreTaxa
- if taxa == '':
- taxa =[]
- else:
- taxa = taxa.split(",")
fasoff = args.fasoff
- searchTaxon = args.searchTaxon
+ searchTaxa = args.searchTaxa
silent = args.silent
+ debug = args.debug
+ force = args.force
+ append = args.append
+ parallel = args.parallel
+ augustus_ref_species = args.augustusRefSpec
+ metaeuk_db = args.metaeukDb
- ###################### How to handling std output ##########################
- # if silent == True:
- # print(out + "fdog.log \n")
- # f = open(out + "fdog.log", "a+")
- # sys.stdout = f
- # else:
- # print(out + "fdog.log \n")
- # sys.stdout = Logger(out)
-
- try:
- f = open(out + "fdog.log", "a+")
- except FileNotFoundError:
- f = open(out + "fdog.log", "w")
-
+ #gene prediction tool
+ augustus = args.augustus
+ if augustus == True:
- if silent == True:
- sys.stderr = f
- sys.stdout = f
+ if augustus_ref_species == '':
+ print("Augustus reference species is required when using Augustus as gene prediction tool")
+ return 1
+ gene_prediction = "augustus"
else:
- sys.stdout = Logger(f)
+ gene_prediction = "metaeuk"
+ # output modes
+ if debug == True and silent == True:
+ print("It's not possible to use booth modes, please restart and use --debug or --silent")
+ return 1
+ else:
+ if debug == True:
+ mode = 'debug'
+ elif silent == True:
+ mode = 'silent'
+ else:
+ mode = 'normal'
#checking paths
if dataPath == '':
@@ -590,178 +954,183 @@ def main():
except:
dataPath = 'config'
- if assemblyDir == '':
- assemblyDir = dataPath + '/assembly_dir/'
if out == '':
out = os.getcwd()
- if core_path == '':
- core_path = out + '/core_orthologs/'
-
-
- # user input has to be checked here before fDOGassembly continues
-
- assembly_names = os.listdir(assemblyDir)
-
-
-
- ########################## some variables ##################################
-
- refBool = False # checks if sequences of reference species were already part of the extended.fa file
- ########### paths ###########
-
- msa_path = core_path + "/" + group +"/"+ group + ".aln"
- hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm"
- fasta_path = core_path + "/" + group +"/"+ group + ".fa"
- consensus_path = out + "/tmp/" + group + ".con"
- profile_path = out + "/tmp/" + group + ".prfl"
-
- ###################### create tmp folder ###################################
-
- os.system('mkdir ' + out + '/tmp')
-
- ######################## consensus sequence ################################
-
- #make a majority-rule consensus sequence with the tool hmmemit from hmmer
- print("Building a consensus sequence \n")
- os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path)
- print("consensus sequence is finished\n")
-
- ######################## block profile #####################################
-
- print("Building a block profile \n")
- cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
- #os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path)
- result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
-
- #print(os.path.getsize(profile_path))
- if int(os.path.getsize(profile_path)) > 0:
- print("block profile is finished \n")
else:
- print("Building block profiles failed. Using prepareAlign to convert alignment\n")
- new_path = core_path + group +"/"+ group + "_new.aln"
- cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path
- result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
- cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
- result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True)
- print("block profile is finished \n")
-
-
- searchBool = False
-
- for asName in assembly_names:
- if searchBool == True:
- break
- if searchTaxon != '' and searchBool == False:
- asName = searchTaxon
- searchBool = True
-
- ################### path definitions ###################################
- os.system('mkdir ' + out + '/tmp/' + asName)
- tmp_path = out + "/tmp/" + asName + "/"
- candidatesOutFile = tmp_path + group + ".candidates.fa"
- if searchTaxon != '':
- orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa"
- fasOutFile = out + "/" + group + "_" + asName
- mappingFile = tmp_path + group + "_" + asName + ".mapping.txt"
+ if out[-1] != "/":
+ out = out + "/"
+ check_path(out)
+
+ if os.path.exists(out + '/' + group):
+ if append != True and force != True:
+ print("Output folder for group " + group + " exists already. Please choose --force or --append.")
+ sys.exit()
+ elif force == True:
+ shutil.rmtree(out + '/' + group, ignore_errors=True)
+ refBool = False
+ os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
+ out = out + '/' + group + '/'
+ elif append == True:
+ out = out + '/' + group + '/'
+ refBool = True
else:
- orthologsOutFile = out + "/" + group + ".extended.fa"
- fasOutFile = out + "/" + group
- mappingFile = out + "/tmp/" + group + ".mapping.txt"
-
-
- print("Searching in species " + asName + "\n")
- assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa"
- db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa"
- ######################## tBLASTn ###########################################
-
- #database anlegen
+ refBool = False # checks if sequences of reference species were already part of the extended.fa file
+ else:
+ os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1')
+ out = out + '/' + group + '/'
+ refBool = False
- db_check = searching_for_db(db_path)
- #print(assembly_path)
- if db_check == 0:
- print("creating a blast data base \n")
- os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path)
- print("database is finished \n")
- else:
- print('blast data base exists already, continuing...')
+ if core_path == '':
+ core_path = out + '/core_orthologs/'
+ else:
+ if not core_path.endswith('/'):
+ core_path = core_path + '/'
+ check_path(core_path)
+ if assemblyDir == '':
+ assemblyDir = dataPath + '/assembly_dir/'
+ check_path(assemblyDir)
- #make a tBLASTn search against the new database
- #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+ if metaeuk_db != '':
+ check_path(metaeuk_db)
- print("tBLASTn search against data base")
- os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -out ' + tmp_path + '/blast_results.out')
- print("tBLASTn search is finished")
- ################### search for candidate regions and extract seq ###########
+ try:
+ f = open(out + "/fdog.log", "a+")
+ except FileNotFoundError:
+ f = open(out + "/fdog.log", "w")
- # parse blast and filter for candiate regions
- regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path)
+ ################## How to handle std output and std error ##################
- if regions == 0:
- #no candidat region are available, no ortholog can be found
- print("No candidate region found")
- continue
+ if mode == 'silent':
+ sys.stderr = f
+ sys.stdout = f
+ else:
+ sys.stdout = Logger(f)
+ ########################### other variables ################################
+ if searchTaxa == []:
+ assembly_names = os.listdir(assemblyDir)
+ else:
+ if len(searchTaxa) > 1:
+ assembly_names = os.listdir(assemblyDir)
+ for Taxon in searchTaxa:
+ if Taxon not in assembly_names:
+ print("Taxon %s is not in the assembly_dir" % Taxon)
+ sys.exit()
+ assembly_names = searchTaxa
else:
- print(str(number_regions) + " candiate regions were found. Extracting sequences...")
- extract_seq(regions, db_path, tmp_path)
-
- ############### make Augustus PPX search ###################################
- print("starting augustus ppx \n")
- augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path)
- print("augustus is finished \n")
+ if searchTaxa[0] in os.listdir(assemblyDir):
+ assembly_names = searchTaxa
+ elif os.path.isfile(searchTaxa[0]):
+ with open(searchTaxa[0]) as file:
+ lines = file.readlines()
+ assembly_names = [line.rstrip() for line in lines]
+ else:
+ print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0])
- ################# backward search to filter for orthologs###################
- reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path)
+ ################################# paths ####################################
+ fasta_path = core_path + "/" + group +"/"+ group + ".fa"
+ check_path(fasta_path)
+ tmp_folder = out + "/tmp"
- if reciprocal_sequences == 0:
- print("No ortholog fulfilled the reciprocity criteria")
- if searchTaxon == '':
- continue
- else:
- cleanup(tmp, tmp_path)
- return 1
+ ########### is/are fDOG reference species part of ortholog group? ##########
- ################## checking accepted genes for co-orthologs ##########################
- print(reciprocal_sequences)
- reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix)
+ fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path)
+ ###################### create tmp folder ###################################
+ cmd = 'mkdir ' + out + '/tmp'
+ starting_subprocess(cmd, 'silent')
- ################ add sequences to extended.fa in the output folder##########
- addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path)
- refBool = True
+ print("Gene: " + group)
+ print("fDOG reference species: " + fdog_ref_species + " \n")
- ############### make Annotation with FAS ###################################
- if searchTaxon != '' and fasoff == False:
- fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
- # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
- os.system('mkdir ' + tmp_path + 'anno_dir')
- os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName )
+ ###################### preparations ########################################
+ if augustus == True:
+ group_computation_time_start = time.time()
+ consensus_path = consensusSequence(core_path, group, mode, out)
+ profile_path = blockProfiles(core_path, group, mode, out)
+ group_computation_time_end = time.time()
+ time_group = group_computation_time_end - group_computation_time_start
+ else:
+ #print("test")
+ profile_path = ""
+ group_computation_time_start = time.time()
+ consensus_path = consensusSequence(core_path, group, mode, out)
+ #concatinade core_group sequences if metaeuk should be run without tblastn
+ group_computation_time_end = time.time()
+ time_group = group_computation_time_end - group_computation_time_start
+
+
+ ###################### ortholog search #####################################
+
+ ortholog_sequences = []
+ time_ortholog_start = time.time()
+
+ if parallel == True:
+ ##################### parallel computation #############################
+ calls = []
+ cpus = mp.cpu_count()
+ pool = mp.Pool(cpus)
+ for asName in assembly_names:
+ calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db])
+
+ results = (pool.imap_unordered(ortholog_search_tblastn, calls))
+ pool.close()
+ pool.join()
+ for i in results:
+ ortholog_sequences.append([i[0], i[1]])
+ for k in i[2]:
+ print(k)
+ else:
+ ###################### computation species wise ################
+ for asName in assembly_names:
+ args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]
+ reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args)
+ ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
+ for k in output_ortholog_search:
+ print(k)
+
+ time_ortholog_end = time.time()
+ time_ortholog = time_ortholog_end - time_ortholog_start
+
+ ################## preparing output ########################################
+ orthologsOutFile = out + "/" + group + ".extended.fa"
+
+ if taxa == []:
+ taxa = [fdog_ref_species]
+ if append == True:
+ addSeq(orthologsOutFile, ortholog_sequences)
+ else:
+ addRef(orthologsOutFile, fasta_path, taxa)
+ addSeq(orthologsOutFile, ortholog_sequences)
+ mappingFile = out + "/tmp/" + group + ".mapping.txt"
- if refBool == False and searchTaxon == '':
- print("No orthologs found. Exciting ...")
- cleanup(tmp, tmp_path)
- return 1
+ if fasoff == False:
+ fas = time.time()
+ print("Calculating FAS scores ...")
- if fasoff == False and searchTaxon == '':
tmp_path = out + '/tmp/'
fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
- # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option
- os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group )
-
+ cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
+ starting_subprocess(cmd, 'silent')
+ clean_fas(out + group + "_forward.domains", 'domains')
+ clean_fas(out + group + "_reverse.domains", 'domains')
+ clean_fas(out + group + ".phyloprofile", 'phyloprofile')
+ print("\t ...finished \n")
################# remove tmp folder ########################################
- if searchTaxon != '':
- cleanup(tmp, tmp_path)
- else:
- cleanup(tmp, out + "/tmp/")
+ end = time.time()
+ time_fas = end - fas
+ print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.")
+ print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas)))
+ sys.stdout = sys.__stdout__
f.close()
-
+ cleanup(tmp, tmp_folder)
if __name__ == '__main__':
main()
diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py
deleted file mode 100644
index ea6e084..0000000
--- a/fdog/mergeAssemblyOutput.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# -*- coding: utf-8 -*-
-
-#######################################################################
-# Copyright (C) 2020 Vinh Tran
-#
-# This script is used to merge all output files (.extended.fa, .phyloprofile,
-# _forward.domains, _reverse.domains) in a given directory into one file each.
-#
-# This script is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for
-# more details
-#
-# Contact: hannah.muelbaier@stud.uni-frankfurt.de
-#
-#######################################################################
-
-import sys
-import os
-from os import listdir as ldir
-import argparse
-from pathlib import Path
-
-def main():
- version = '0.0.1'
- parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.')
- parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
- action='store', default='', required=True)
- parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True)
- parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False)
- args = parser.parse_args()
-
- directory = args.input
- out = args.output
- cleanup = args.cleanup
- if not os.path.exists(os.path.abspath(directory)):
- sys.exit('%s not found' % directory)
- else:
- directory = os.path.abspath(directory)
-
- phyloprofile = None
- set_phylo = set()
- domains_0 = None
- set_domains_f = set()
- domains_1 = None
- set_domains_r = set()
- ex_fasta = None
- set_fasta = set()
- header_bool = False
- for infile in ldir(directory):
- if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile':
- if not phyloprofile:
- phyloprofile = open(out + '.phyloprofile', 'w')
- phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n')
- with open(directory + '/' + infile, 'r') as reader:
- lines = reader.readlines()
- for line in lines:
- if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo:
- phyloprofile.write(line)
- if len(lines) > 1:
- set_phylo = set(lines)
- if cleanup == True:
- os.remove(directory + '/' + infile)
- elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains':
- if not domains_0:
- domains_0 = open(out + '_forward.domains', 'w')
- with open(directory + '/' + infile, 'r') as reader:
- lines = reader.readlines()
- for line in lines:
- if line not in set_domains_f:
- domains_0.write(line)
- if len(lines) > 1:
- set_domains_f = set(lines)
- if cleanup == True:
- os.remove(directory + '/' + infile)
- elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains':
- if not domains_1:
- domains_1 = open(out + '_reverse.domains', 'w')
- with open(directory + '/' + infile, 'r') as reader:
- lines = reader.readlines()
- for line in lines:
- if line not in set_domains_r:
- domains_1.write(line)
- if len(lines) > 1:
- set_domains_r = set(lines)
- if cleanup == True:
- os.remove(directory + '/' + infile)
- elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa':
- if not ex_fasta:
- ex_fasta = open(out + '.extended.fa', 'w')
- with open(directory + '/' + infile, 'r') as reader:
- lines = reader.readlines()
- header = set()
- #print(set_fasta)
- for line in lines:
- if line[0] == ">":
- header.add(line)
- if line not in set_fasta:
- ex_fasta.write(line)
- header_bool = True
- else:
- header_bool = False
- else:
- if header_bool == True:
- ex_fasta.write(line)
- set_fasta = header
- if cleanup == True:
- os.remove(directory + '/' +infile)
-
- if phyloprofile:
- phyloprofile.close()
- if domains_0:
- domains_0.close()
- if domains_1:
- domains_1.close()
- if ex_fasta:
- ex_fasta.close()
-
-
-if __name__ == "__main__":
- sys.exit(main())
diff --git a/fdog/runMulti.py b/fdog/runMulti.py
index 7d73cdd..be552a7 100644
--- a/fdog/runMulti.py
+++ b/fdog/runMulti.py
@@ -107,10 +107,9 @@ def compileCore(options, seeds, inFol, cpu, outpath):
for seed in seeds:
seqFile = [inFol + '/' + seed]
seqName = getSeedName(seed)
-
if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)):
(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core')
- coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute])
+ coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute])
if len(coreCompilationJobs) > 0:
pool = mp.Pool(cpu)
coreOut = []
@@ -132,7 +131,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath):
for seed in seeds:
seqFile = [inFol + '/' + seed]
seqName = getSeedName(seed)
- (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
+ (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
if mute == True:
print(seed)
else:
@@ -331,14 +330,6 @@ def main():
optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
- assembly_options = parser.add_argument_group('Assembly options')
- assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
- assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
- assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
- assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
- assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
- assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
- assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
### get arguments
args = parser.parse_args()
@@ -420,15 +411,6 @@ def main():
silent = False
else:
silent = True
-
- #fdog_goes_assembly arguments
- assembly = args.assembly
- assemblyFile = args.assemblyFile
- augustusRefSpec = args.augustusRefSpec
- avIntron = args.avIntron
- lengthExtension = args.lengthExtension
- searchTool = args.searchTool
- matrix = args.scoringmatrix
### check fas
if not fasoff:
diff --git a/fdog/runSingle.py b/fdog/runSingle.py
index 7e0f858..f239f90 100644
--- a/fdog/runSingle.py
+++ b/fdog/runSingle.py
@@ -67,13 +67,13 @@ def getfdogInfo(fdogPath, infoType):
exit('%s not found' % (fdogPath + '/bin/oneSeq.pl'))
def runSingle(args):
- (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args
+ (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args
# basic command
(fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs
cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec)
# add paths
- (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs
- cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath)
+ (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs
+ cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath)
# add other I/O options
(append, force, noCleanup, group, blast, db) = ioArgs
if append == True:
@@ -165,28 +165,7 @@ def runSingle(args):
cmd = cmd + ' -debug'
if silent == True:
cmd = cmd + ' -silent'
- # add assembly options
- (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs
- if assembly == True:
- cmd = cmd + ' -assembly'
- cmd = cmd + ' -reuseCore'
- if not augustusRefSpec == '':
- cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec
- else:
- sys.exit('An augutus reference species is requiered by using the option --assembly')
- if not avIntron == '':
- cmd = cmd + ' -avIntron=%s' % avIntron
- if not lengthExtension == '':
- cmd = cmd + ' -lengthExtension=%s' % lengthExtension
- if not assemblyFile == '':
- cmd = cmd + ' -assemblyFile=%s' % assemblyFile
- if not searchTool == '':
- cmd = cmd + ' -searchTool=%s' % searchTool
- if not matrix == '':
- cmd = cmd + ' -scoringmatrix=%s' % matrix
- if not dataPath == '':
- cmd = cmd + ' -dataPath=%s' % dataPath
- #print(cmd)
+ # print(cmd)
if mute == True:
cmd = cmd + ' > /dev/null 2>&1'
try:
@@ -238,8 +217,6 @@ def main():
optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='')
optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='')
optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
- optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='')
-
addtionalIO = parser.add_argument_group('Other I/O options')
addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
@@ -326,14 +303,6 @@ def main():
optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
- assembly_options = parser.add_argument_group('Assembly options')
- assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False)
- assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='')
- assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='')
- assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int)
- assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int)
- assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast')
- assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62')
### get arguments
args = parser.parse_args()
@@ -353,7 +322,6 @@ def main():
searchpath = args.searchpath
weightpath = args.weightpath
pathFile = args.pathFile
- assemblypath = args.assemblypath
# other I/O arguments
append = args.append
@@ -415,15 +383,6 @@ def main():
else:
silent = True
- #fdog_goes_assembly arguments
- assembly = args.assembly
- assemblyFile = args.assemblyFile
- augustusRefSpec = args.augustusRefSpec
- avIntron = args.avIntron
- lengthExtension = args.lengthExtension
- searchTool = args.searchTool
- matrix = args.scoringmatrix
-
### get fdog and data path
dataPath = ''
fdogPath = os.path.realpath(__file__).replace('/runSingle.py','')
@@ -471,29 +430,19 @@ def main():
except:
sys.exit('weightpath not found in %s' % pathFile)
- if assemblypath == '':
- assemblypath = dataPath + '/assembly_dir'
- if dataPath == 'config':
- try:
- assemblypath = cfg['assemblypath']
- except:
- sys.exit('assemblypath not found in %s' % pathFile)
- if assembly == True:
- searchpath = assemblypath
-
### check input arguments
seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath])
# group arguments
basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth]
ioArgs = [append, force, noCleanup, group, blast, db]
- pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath]
+ pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath]
coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
fasArgs = [fasoff, countercheck, coreFilter, minScore]
orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
otherArgs = [cpu, hyperthread, checkOff, debug, silent]
### run fdog
- runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False])
+ runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False])
### create PhyloProfile config file
createConfigPP(outpath, seqName, refspec)
diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh
index e5ca4a9..1eaf176 100755
--- a/fdog/setup/install_lib.sh
+++ b/fdog/setup/install_lib.sh
@@ -85,6 +85,7 @@ dependenciesUbuntu=(
perl-doc
locales
lib32z1
+ augustus
)
dependenciesMac=(
@@ -94,6 +95,7 @@ dependenciesMac=(
mafft
brewsci/bio/muscle
blast
+ augustus
)
if [ "$sys" == "Darwin" ]; then
@@ -108,7 +110,11 @@ else
sudo apt-get update -y
for i in "${dependenciesUbuntu[@]}"; do
echo $i
- sudo apt-get install -y -qq $i > /dev/null
+ if ["$i" == "augustus"]; then
+ sudo apt install augustus > /dev/null
+ else
+ sudo apt-get install -y -qq $i > /dev/null
+ fi
done
fi
@@ -119,6 +125,7 @@ dependencies=(
mafft
muscle
blastn
+ augustus
)
for i in "${dependencies[@]}"; do
diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh
index 7515ed2..28eb851 100755
--- a/fdog/setup/setup.sh
+++ b/fdog/setup/setup.sh
@@ -315,6 +315,8 @@ mafft
muscle
clustalw
blastp
+augustus
+tblastn
)
for i in "${dependencies[@]}"; do
@@ -324,6 +326,14 @@ for i in "${dependencies[@]}"; do
tool="clustalw2"
fi
fi
+ if [ $tool == "tblastn" ]; then
+ requiredver="2.9.0"
+ currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
+ t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1)
+ if [ $t == $currentver ]; then
+ echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+ fi
+ fi
if [ -z "$(which $tool)" ]; then
echo -e "\t\e[31mWARNING $tool not found!\e[0m"
flag=1
diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh
index cf1bc6d..73b8573 100755
--- a/fdog/setup/setup_conda.sh
+++ b/fdog/setup/setup_conda.sh
@@ -116,6 +116,7 @@ dependencies=(
mafft # for linsi
muscle
fasta36
+ augustus #for fdog.assembly
)
for i in "${dependencies[@]}"; do
@@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do
fi
elif [ "$tool" = "fasta36" ]; then
conda install -y -c bioconda fasta3
+ elif [ "$tool" = "augustus" ]; then
+ conda install -y -c bioconda augustus
else
conda install -y -c bioconda $i
fi
@@ -363,6 +366,8 @@ clustalw
mafft
muscle
fasta3
+augustus
+tblastn
)
for i in "${condaPkgs[@]}"; do
if [[ -z $(conda list | $grepprog "$i ") ]]; then
@@ -375,6 +380,13 @@ for i in "${condaPkgs[@]}"; do
progname="hmmsearch"
elif [ "$i" == "fasta3" ]; then
progname="fasta36"
+ elif [ "$i" == "tblastn" ]; then
+ requiredver="2.9.0"
+ currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')"
+ t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1)
+ if [ $t == $currentver ]; then
+ echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m"
+ fi
fi
if [ -z "$(which $progname)" ]; then
echo -e "\t\e[31m$i could not be installed\e[0m"