diff --git a/.DS_Store b/.DS_Store index fa2521e..bcbd073 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/README.md b/README.md index 52f11e2..8db83ce 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG) ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) +# Poster fDOG - Assembly +https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf # Table of Contents * [How to install](#how-to-install) * [Install the fDOG package](#install-the-fdog-package) diff --git a/fdog/.DS_Store b/fdog/.DS_Store index f638c26..a99a01c 100644 Binary files a/fdog/.DS_Store and b/fdog/.DS_Store differ diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl index b762854..09ec8ec 100755 --- a/fdog/bin/hamstr.pl +++ b/fdog/bin/hamstr.pl @@ -200,6 +200,7 @@ ######################## start main ########################################### my $version = "HaMStR v.13.4.6"; + ######################## checking whether the configure script has been run ### my $configure = 0; if ($configure == 0){ diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 0352c0d..2fe8333 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -135,6 +135,7 @@ ############ General settings my $version = 'oneSeq v.2.4.1'; + ##### configure for checking if the setup.sh script already run my $configure = 0; if ($configure == 0){ @@ -208,7 +209,6 @@ my $idx_dir = "$path/taxonomy/"; my $dataDir = $path . '/data'; my $weightPath = "$path/weight_dir/"; -my $assembly_dir = "$path/assembly_dir/"; my @defaultRanks = ( 'superkingdom', 'kingdom', @@ -313,15 +313,6 @@ my %hashTree; my $aln = 'muscle'; my $searchTaxa; -#variables for fdog_goes_assembly -my $assembly; -my $augustusRefSpec; -my $avIntron; -my $lengthExtension; -my $assemblyPath; -my $searchTool = 'blast'; -my $matrix = 'blosum62'; -my $dataPath = ''; ################# Command line options GetOptions ( "h" => \$help, @@ -384,15 +375,7 @@ "distDeviation=s" => \$distDeviation, "aligner=s" => \$aln, "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa, - "assembly" => \$assembly, - "assemblypath=s" => \$assemblyPath, - "augustusRefSpec=s" => \$augustusRefSpec, - "avIntron=s" => \$avIntron, - "lengthExtension=s" => \$lengthExtension, - "searchTool=s" => \$searchTool, - "scoringmatrix=s" => \$matrix, - "dataPath=s" => \$dataPath + "searchTaxa=s" => \$searchTaxa ); $outputPath = abs_path($outputPath); @@ -404,8 +387,6 @@ $weightPath = abs_path($weightPath)."/"; $genome_dir = abs_path($genome_dir)."/"; $taxaPath = $genome_dir; -$dataPath = abs_path($dataPath)."/"; -$assembly_dir = abs_path($assemblyPath)."/"; ############# do initial check if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { @@ -415,7 +396,7 @@ initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); } - if (!defined $coreex && !defined $assembly) { + if (!defined $coreex) { if (!grep(/$minDist/, @defaultRanks)) { die "ERROR: minDist $minDist invalid!\n"; } @@ -499,7 +480,7 @@ # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) # get annotations for seed sequence if fas support is on -if ($fas_support && !$assembly){ +if ($fas_support){ if (!$weightPath) { createWeightFolder(); } @@ -508,7 +489,7 @@ my $coreStTime = gettime(); #time; #core-ortholog search -if (!$coreex && !$assembly) { +if (!$coreex) { print "\nCore compiling...\n"; $coremode = 1; $taxaPath = $blastPath; @@ -646,12 +627,7 @@ my $final_eval_blast = $eval_blast*$eval_relaxfac; my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - if (!$assembly){ - $taxaPath = $genome_dir; - } - else{ - $taxaPath = $assembly_dir; - } + $taxaPath = $genome_dir; my @searchTaxa; unless ($searchTaxa) { unless($groupNode) { @@ -741,11 +717,15 @@ if (-e $finalOutput) { addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); } +### remove duplicated seq in extended.fa +if (-e $finalOutput) { + addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); +} push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n"; - -if(!$coreOnly && !$assembly){ +## Evaluation of all orthologs that are predicted by the final run +if(!$coreOnly){ my $fasStTime = gettime(); my $processID = $$; @@ -757,7 +737,7 @@ addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # calculate FAS scores for final extended.fa - if ($fas_support && !$assembly) { + if ($fas_support) { print "Starting the feature architecture similarity score computation...\n"; my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno"; unless ($countercheck) { @@ -770,21 +750,12 @@ } push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; - if($autoclean){ print "Cleaning up...\n"; runAutoCleanUp($processID); } } -if ($assembly){ - my $file_assembly_out; - $file_assembly_out = $outputPath . '/' . $seqName; - my $cmd_merge; - $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup"; - printDebug($cmd_merge); - system($cmd_merge); -} ## Delete tmp folder unless ($debug) { my $delTmp = "rm -rf $tmpdir"; @@ -1194,10 +1165,10 @@ sub checkOptions { if ($force == 1 and $append ==1) { $force = 0; } - ### check the presence of the pre-computed core set if options reuseCore or assembly is used - if ($coreex || $assembly) { + ### check the presence of the pre-computed core set + if ($coreex) { if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; + print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; exit; } } @@ -1268,7 +1239,7 @@ sub checkOptions { ### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected $optbreaker = 0; - while(!$minCoreOrthologs and (!$coreex and !$assembly)) { + while(!$minCoreOrthologs and !$coreex) { if ($optbreaker >= 3){ print "No proper number given ... exiting.\n"; exit; @@ -1283,12 +1254,10 @@ sub checkOptions { $filter = 'no' if $filter eq 'F'; } - if (!$assembly){ - $inputSeq = fetchSequence($seqFile, $dataDir); - } + $inputSeq = fetchSequence($seqFile, $dataDir); ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '' && !$assembly) { + if($seqId eq '') { my $besthit; if (!$blast){ ## a refspec has been determined @@ -1398,9 +1367,8 @@ sub checkOptions { #### checking for the min and max distance for the core set compilation #### omit this check, if the option reuseCore has been selected (added 2019-02-04) $optbreaker = 0; - if (!$coreex and !$assembly) { + if (!$coreex) { my $node; - #print "Testing coreex assembly\n"; $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); $node->name('supplied', $refSpec); if (lc($maxDist) eq "root"){ @@ -2673,7 +2641,7 @@ sub initialCheck { } } # check weight_dir - if ($fasoff != 1 && !$assembly) { + if ($fasoff != 1) { my %seen; my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); my @notFolder; diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b802b26..7027236 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,3 +1,21 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2021 Hannah Muelbaier +# +# This script is used to run fDOG-Assembly which performs targeted ortholog +# searches on genome assemblies +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@gmail.com +# +####################################################################### + ############################ imports ########################################### import os import os.path @@ -8,7 +26,31 @@ import argparse import yaml import subprocess +import time +import shutil +import multiprocessing as mp + ########################### functions ########################################## +def check_path(path): + if not os.path.exists(path): + print(path + " does not exist. Exciting ...") + sys.exit() + +def check_ref_sepc(species_list, fasta_file): + file = open(fasta_file, "r") + lines = file.readlines() + species_file = [] + + for line in lines: + if line[0] == ">": + species = line.split("|")[1] + species_file.append(species) + for species in species_list: + if species in species_file: + return species + print("Reference species is not part of the ortholog group. Exciting ...") + sys.exit() + def load_config(config_file): with open(config_file, 'r') as stream: try: @@ -16,82 +58,129 @@ def load_config(config_file): except yaml.YAMLError as exc: print(exc) +def starting_subprocess(cmd, mode, time_out = None): + + try: + if mode == 'debug': + result = subprocess.run(cmd, shell=True, timeout = time_out) + elif mode == 'silent': + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out) + elif mode == 'normal': + result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out) + except subprocess.TimeoutExpired: + return 1 + def merge(blast_results, insert_length): + #merging overlapping and contigous candidate regions + #format dictionary: {node_name: [(,,evalue, ,,, )]} number_regions = 0 + insert_length = int(insert_length) + score_list = [] for key in blast_results: locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) - #print("test") - #print(locations) size_list = len(locations) - j = 0 - while j < size_list-1: - i = 1 - while i < size_list-1: - - if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])): - #merge overlapping regions + i = j + 1 + while i < size_list: + if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + #merge overlapping regions plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) + locations.pop(i) + size_list -= 1 + i -= 1 + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merge overlapping regions minus strand + locations[j][0] = min(locations[j][0], locations[i][0]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 - elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5])): - #print(j) + elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) + locations.pop(i) + size_list -= 1 + i -=1 + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand + locations[j][0] = min(locations[j][0], locations[i][0]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 i += 1 j += 1 + for entry in locations: + score_list.append(entry[6]) number_regions += len(locations) blast_results[key] = locations - #print(blast_results) - return blast_results, number_regions + return blast_results, number_regions, score_list -def parse_blast(line, blast_results): - # format blast line: - #fomrat dictionary: {node_name: [(,)]} - #print(line) +def parse_blast(line, blast_results, cutoff): + # format blast line: + # format dictionary: {node_name: [(,,evalue, ,,, )]} line = line.replace("\n", "") line_info = line.split("\t") - #print(line_info) evalue = float(line_info[3]) - #cut off - if evalue > 0.00001: + if evalue > cutoff: return blast_results, evalue #add region to dictionary else: - node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5] + node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6]) split = node_name.split("|") - - # finding out on which strand tBLASTn founded a hit + # finding out on which strand tBLASTn found a hit if sstart < send: strand = "+" else: - sstart = line_info[2] - send = line_info[1] + sstart = int(line_info[2]) + send = int(line_info[1]) strand = "-" - - #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001 + #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off if len(split) > 1: node_name = split[1] if node_name in blast_results: list = blast_results[node_name] - list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand]) + list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]) blast_results[node_name] = list else: - blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]] + blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]] return blast_results, evalue -def candidate_regions(intron_length, evalue, tmp_path): +def get_x_results(blast_dic, x, score_list): + + new_dic = {} + score_list.sort(reverse=True) + min = score_list[x - 1] + number_regions = 0 + + for key in blast_dic: + key_list = [] + entries = blast_dic[key] + for i in entries: + if i[6] >= min: + key_list.append(i) + if key_list != []: + new_dic[key] = key_list + number_regions += len(key_list) + return new_dic, number_regions + +def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 blast_file = open(tmp_path + "/blast_results.out", "r") @@ -104,67 +193,144 @@ def candidate_regions(intron_length, evalue, tmp_path): if not line: break #parsing blast output - blast_results, evalue = parse_blast(line, blast_results) - #evalue cut-off - if not evalue <= evalue: - break + blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) + if blast_results == {}: + blast_file.close() return 0,0 else: - candidate_regions, number_regions = merge(blast_results, intron_length) - #candidate_regions, number_regions = merge_regions(blast_results, cut_off) - #print(candidate_regions, number_regions) + candidate_regions, number_regions, score_list = merge(blast_results, intron_length) + blast_file.close() + if number_regions > x: + candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list) return candidate_regions, number_regions -def extract_seq(region_dic, path, tmp_path): - #print(region_dic) +def extract_seq(region_dic, path, tmp_path, mode): + for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - -def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path): + starting_subprocess(cmd, mode) + +def extract_sequence_from_to(name, file, start, end): + #print(name) + out = name + ".fasta" + if int(start) < 0: + start = 0 + with open(out,"w") as f: + for seq_record in SeqIO.parse(file, "fasta"): + f.write(">" + str(seq_record.id) + "\n") + sequence_length = len(seq_record.seq) + if int(end) > sequence_length: + end = sequence_length + #for testing only + #start = 0 + #end = len(seq_record.seq) + f.write(str(seq_record.seq[int(start):int(end)]) + "\n") + + return out, start, end + +def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): output = open(candidatesOutFile, "w") for key in regions: locations = regions[key] counter = 0 for i in locations: + # some variables counter += 1 start = str(i[0] - length_extension) end = str(i[1] + length_extension) name = key + "_" + str(counter) - #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff") - + # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) + #print(cmd) + starting_subprocess(cmd, 'silent') + # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) + # parsing header and sequences + try: + sequence_file = open(tmp_path + name + ".aa", "r") + lines = sequence_file.readlines() + for line in lines: + if line[0] == ">": + id = line.replace(">", "") + header = ">" + group + "|" + ass_name + "|" + name + "_" + id + output.write(header) + else: + output.write(line) + sequence_file.close() + except FileNotFoundError: + pass + #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") + output.close() - sequence_file = open(tmp_path + name + ".aa", "r") - lines = sequence_file.readlines() - for line in lines: - if line[0] == ">": - id = line.replace(">", "") - header = ">" + group + "|" + ass_name + "|" + name + "_" + id - output.write(header) - else: - output.write(line) - sequence_file.close() +def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db): + output = open(candidatesOutFile, "w") + region = open(candidatesOutFile.replace(".candidates.fa", ".regions.txt"), "w") + region.write("Conting/scaffold" + "\t" + "start" + "\t" + "end" + "\n") + + for key in regions: + locations = regions[key] + counter = 0 + for i in locations: + #some variables + counter += 1 + start = str(i[0] - length_extension) + end = str(i[1] + length_extension) + name = key + "_" + str(counter) + file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) + region.write(file + "\t" + str(start) + "\t" + str(end)) + #metaeuk call + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" + #print(cmd) + # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 + starting_subprocess(cmd, mode) + # parsing header and sequences + try: + sequence_file = open(tmp_path + name + ".fas", "r") + lines = sequence_file.readlines() + #print(lines) + id = 0 + for line in lines: + if line[0] == ">": + id += 1 + header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + "\n" + output.write(header) + else: + output.write(line) + sequence_file.close() + + gff_file = open(tmp_path + name + ".gff", "r") + lines = gff_file.readlines() + new_lines = [] + for line in lines: + values = line.split("\t") + values[3] = str(int(values[3]) + int(start)) + values[4] = str(int(values[4]) + int(start)) + new_lines.append("\t".join(values)) + gff_file.close() + gff_file = open(tmp_path + name + ".gff", "w") + for line in new_lines: + gff_file.write(line) + gff_file.close() + except FileNotFoundError: + pass output.close() def searching_for_db(assembly_path): - #print("test: " + str(assembly_path) + "\n") + db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] check = True for end in db_endings: - #print(assembly_path + end + "\n") - check = check and os.path.exists(assembly_path + end) - #print(check) + if not any(File.endswith(end) for File in os.listdir(assembly_path)): + check = False return check def get_distance_biopython(file, matrix): + #print(file) aln = AlignIO.read(open(file), 'fasta') calculator = DistanceCalculator(matrix) dm = calculator.get_distance(aln) @@ -216,16 +382,28 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates if msaTool == "muscle": os.system("muscle -quiet -in " + output_file + " -out " + aln_file) #print("muscle -quiet -in " + output_file + " -out " + aln_file) + if not os.path.exists(aln_file): + print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.") + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + elif msaTool == "mafft-linsi": #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) - #d_ref = get_distance(aln_file, best_hit, ref) - #d = get_distance(aln_file, best_hit, candidate_name) - distances = get_distance_biopython(aln_file, matrix) + try: + distances = get_distance_biopython(aln_file, matrix) + distance_hit_query = distances[best_hit, candidate_name] + distance_ref_hit = distances[best_hit, ref] + #print(distances) + except ValueError: + pass + #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" - distance_hit_query = distances[best_hit, candidate_name] - distance_ref_hit = distances[best_hit, ref] + + + #distance_hit_query = distances[best_hit, candidate_name] + #distance_ref_hit = distances[best_hit, ref] if distance_ref_hit < distance_hit_query: #accepted @@ -235,7 +413,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #rejected return 0, distance_ref_hit, distance_hit_query -def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path): +def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode): # the backward search uses the genes predicted from augustus and makes a blastp search #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa seedDic = getSeedInfo(fasta_path) @@ -248,10 +426,11 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva try: id_ref = seedDic[fdog_ref_species] except KeyError: - print("The fDOG reference species isn't part of the core ortholog group, ... exciting") + #print("The fDOG reference species isn't part of the core ortholog group, ... exciting") return 0, seed if searchTool == "blast": - os.system("blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile) + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) else: print("diamonds are the girls best friends") ##### diamond call @@ -265,45 +444,46 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva id, gene, evalue = (line.replace("\n", "")).split("\t") gene_name = gene.split("|")[2] if gene_name != old_name: - print("candidate:%s"%(gene_name)) - print("blast-hit:%s"%(id)) + print("candidate:%s"%(gene_name)) if mode == "debug" else "" + print("blast-hit:%s"%(id)) if mode == "debug" else "" min = float(evalue) if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + if distance_ref_hit != "NaN": + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnothitting\n") + print("\tnothitting\n") if mode == "debug" else "" elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnot hitting\n") + print("\tnot hitting\n") if mode == "debug" else "" old_name = gene_name if orthologs == []: - print("No hit in the backward search, ...exciting") + #print("No hit in the backward search, ...exciting") return 0, seed else: @@ -328,15 +508,16 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set({}) for species in seed: - print("backward search in species " + species + "\n") + print("backward search in species %s\n" %species) orthologs_new = set({}) try: id_ref = seedDic[species] except KeyError: - print("The species " + species + " isn't part of the core ortholog group, ... exciting") + #print("The species " + species + " isn't part of the core ortholog group, ... exciting") return 0, seed - os.system("blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile) + cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) alg_file = open(tmp_path + "/blast_" + species, "r") lines = alg_file.readlines() alg_file.close() @@ -355,23 +536,54 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(species) #print(orthologs_new) + #print(orthologs) if species == fdog_ref_species: orthologs = orthologs_new else: orthologs = orthologs & orthologs_new - if orthologs == {}: - print("No ortholog was found with option --strict") + if len(orthologs) == 0: + #print("No ortholog was found with option --strict") return 0, seed - - #print(orthologs) + orthologs = set(orthologs) return list(orthologs), seed -def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): - #print("addSequences") - #print(sequenceIds) +def addRef(output, core_fasta, species_list): #print(species_list) + output_file = open(output, "a+") + seq_records_core = readFasta(core_fasta) + seq_records_core = list(seq_records_core) + for species in species_list: + for entry_core in seq_records_core: + if species in entry_core.id: + output_file.write(">" + entry_core.id + "\n") + output_file.write(str(entry_core.seq) + "\n") + output_file.close() + +def addSeq(output, seq_list): + output_file = open(output, "a+") + + for item in seq_list: + #print(item) + candidate_fasta = item[1] + sequenceIds = item[0] + if sequenceIds == 0 or sequenceIds == []: + continue + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + output_file.close() + +def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): + output_file = open(output, "a+") if refBool == False: seq_records_core = readFasta(core_fasta) @@ -382,20 +594,24 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(">" + entry_core.id + "\n") output_file.write(str(entry_core.seq) + "\n") - seq_records_candidate = readFasta(candidate_fasta) - seq_records_candidate = list(seq_records_candidate) - for entry_candidate in seq_records_candidate: - #print(entry_candidate.id) - #print(sequenceIds) - if entry_candidate.id in sequenceIds: - output_file.write(">" + entry_candidate.id + "\n") - output_file.write(str(entry_candidate.seq) + "\n") + if sequenceIds != 0: + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") output_file.close() return 0 def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] + #fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") @@ -404,18 +620,22 @@ def createFasInput(orthologsOutFile, mappingFile): ncbi_id = (seq.id.split("@"))[1] mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n") - + mappingFile.close() return fas_seed_id def cleanup(tmp, tmp_path): if tmp == False: - os.system('rm -r ' + tmp_path) - -def checkOptions(): - pass - #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! + timeout = time.time() + 60*1 + while os.path.exists(tmp_path): + shutil.rmtree(tmp_path, ignore_errors=True) + if time.time() > timeout: + print("tmp folder could not be removed!") + break def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): + if len(candidate_names) == 1: + return candidate_names + candidates = readFasta(candidatesFile) ref = readFasta(fasta) @@ -431,18 +651,19 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.write(str(record.seq) + "\n") break + already_written = [] for record in candidates: for name in candidate_names: - if name in record.id: - f.write(">" + name + "\n") - f.write(str(record.seq) + "\n") + if name == record.id: + if name not in already_written: + f.write(">" + record.id + "\n") + f.write(str(record.seq) + "\n") + already_written.append(name) f.close() if msaTool == "muscle": os.system("muscle -quiet -in " + out + " -out " + aln_file) - #print("muscle -quiet -in " + output_file + " -out " + aln_file) elif msaTool == "mafft-linsi": - #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file) distances = get_distance_biopython(aln_file, matrix) @@ -452,19 +673,160 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for name in candidate_names: distance = distances[ref_id , name] - if distance < min_dist: + if distance <= min_dist: min_dist = distance min_name = name - checked = [] - + checked = [min_name] for name in candidate_names: - if distances[min_name , name] < distances[min_name , ref_id]: + if name == min_name: + pass + elif distances[min_name , name] <= distances[min_name , ref_id]: checked.append(name) return checked +def clean_fas(path, file_type): + file = open(path, "r") + lines = file.readlines() + file.close() + file = open(path,"w") + + for line in lines: + if file_type == 'domains': + long_id, remain = line.split("#") + id = long_id.split("|")[0] + new_line = id + "#" + remain + else: + long_id, remain = line.split("\t", 1) + id = long_id.split("|")[0] + new_line = id + "\t" + remain + + file.write(new_line) + file.close() + +def ortholog_search_tblastn(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db) = args + output = [] + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + #orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + #mappingFile = out + "/tmp/" + group + ".mapping.txt" + + output.append("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + db_check = searching_for_db(blast_dir_path) + + if db_check == 0: + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + + #makes a tBLASTn search against database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + time_tblastn_start = time.time() + exit_code = starting_subprocess(cmd, mode, 3600) + time_tblastn_end = time.time() + time_tblastn = time_tblastn_end - time_tblastn_start + if exit_code == 1: + output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) + return [], candidatesOutFile, output + + output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) + + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + if regions == 0: + #no candidat region are available, no ortholog can be found + output.append("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile, output + + else: + output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + extract_seq(regions, db_path, tmp_path, mode) + + + if gene_prediction == "augustus": + ############### make Augustus PPX search ################################### + time_augustus_start = time.time() + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + else: + time_metaeuk_start = time.time() + if metaeuk_db == '': + db = fasta_path + else: + db = metaeuk_db + metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db) + time_metaeuk_end = time.time() + time_metaeuk = time_metaeuk_end - time_metaeuk_start + output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + #print("No genes found at candidate regions\n") + return [], candidatesOutFile, output + + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + + if reciprocal_sequences == 0: + if regions != 0: + output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile, output + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + + return reciprocal_sequences, candidatesOutFile, output + +def blockProfiles(core_path, group, mode, out): + + ######################## paths ################################ + msa_path = core_path + "/" + group +"/"+ group + ".aln" + check_path(msa_path) + profile_path = out + "/tmp/" + group + ".prfl" + + ######################## block profile ##################################### + + print("Building a block profile ...") + cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + + if int(os.path.getsize(profile_path)) > 0: + print("\t ...finished \n") + else: + print("Building block profiles failed. Using prepareAlign to convert alignment\n") + new_path = core_path + group +"/"+ group + "_new.aln" + cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path + starting_subprocess(cmd, mode) + cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + print(" \t ...finished \n") + + return profile_path + +def consensusSequence(core_path, group, mode, out): + + ######################## paths ################################ + hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + check_path(hmm_path) + consensus_path = out + "/tmp/" + group + ".con" + + ######################## consensus sequence ################################ + #make a majority-rule consensus sequence with the tool hmmemit from hmmer + print("Building a consensus sequence") + cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path + starting_subprocess(cmd, mode) + print("\t ...finished\n") + + return consensus_path + class Logger(object): def __init__(self, file): self.file = file @@ -478,24 +840,23 @@ def write(self, message): def flush(self): pass - def main(): - #################### handle user input ######################################## - - version = '0.0.1' + #################### handle user input ##################################### + start = time.time() + version = '0.1.3' + ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) - + ################## required arguments ###################################### required = parser.add_argument_group('Required arguments') required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) - required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) - required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True) - + required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) + ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) @@ -508,34 +869,34 @@ def main(): optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') - optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='') - optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) + #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') + optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[]) optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) - + optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) + optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) + optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False) + optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') + optional.add_argument('--metaeukDb', help='path to metaeuk reference database', action='store', default='') args = parser.parse_args() # required group = args.gene - augustus_ref_species = args.augustusRefSpec fdog_ref_species = args.refSpec #paths user input assemblyDir = args.assemblyPath dataPath = args.dataPath core_path = args.coregroupPath - out = args.out + "/" + out = args.out pathFile = args.pathFile #I/O tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - filter = args.filter - if filter == True or filter == 'yes': - filter = 'yes' - else: - filter = 'no' #others average_intron_length = args.avIntron length_extension = args.lengthExtension @@ -544,35 +905,38 @@ def main(): msaTool = args.msaTool matrix = args.scoringmatrix taxa = args.coreTaxa - if taxa == '': - taxa =[] - else: - taxa = taxa.split(",") fasoff = args.fasoff - searchTaxon = args.searchTaxon + searchTaxa = args.searchTaxa silent = args.silent + debug = args.debug + force = args.force + append = args.append + parallel = args.parallel + augustus_ref_species = args.augustusRefSpec + metaeuk_db = args.metaeukDb - ###################### How to handling std output ########################## - # if silent == True: - # print(out + "fdog.log \n") - # f = open(out + "fdog.log", "a+") - # sys.stdout = f - # else: - # print(out + "fdog.log \n") - # sys.stdout = Logger(out) - - try: - f = open(out + "fdog.log", "a+") - except FileNotFoundError: - f = open(out + "fdog.log", "w") - + #gene prediction tool + augustus = args.augustus + if augustus == True: - if silent == True: - sys.stderr = f - sys.stdout = f + if augustus_ref_species == '': + print("Augustus reference species is required when using Augustus as gene prediction tool") + return 1 + gene_prediction = "augustus" else: - sys.stdout = Logger(f) + gene_prediction = "metaeuk" + # output modes + if debug == True and silent == True: + print("It's not possible to use booth modes, please restart and use --debug or --silent") + return 1 + else: + if debug == True: + mode = 'debug' + elif silent == True: + mode = 'silent' + else: + mode = 'normal' #checking paths if dataPath == '': @@ -590,178 +954,183 @@ def main(): except: dataPath = 'config' - if assemblyDir == '': - assemblyDir = dataPath + '/assembly_dir/' if out == '': out = os.getcwd() - if core_path == '': - core_path = out + '/core_orthologs/' - - - # user input has to be checked here before fDOGassembly continues - - assembly_names = os.listdir(assemblyDir) - - - - ########################## some variables ################################## - - refBool = False # checks if sequences of reference species were already part of the extended.fa file - ########### paths ########### - - msa_path = core_path + "/" + group +"/"+ group + ".aln" - hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" - fasta_path = core_path + "/" + group +"/"+ group + ".fa" - consensus_path = out + "/tmp/" + group + ".con" - profile_path = out + "/tmp/" + group + ".prfl" - - ###################### create tmp folder ################################### - - os.system('mkdir ' + out + '/tmp') - - ######################## consensus sequence ################################ - - #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence \n") - os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path) - print("consensus sequence is finished\n") - - ######################## block profile ##################################### - - print("Building a block profile \n") - cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - #os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path) - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - - #print(os.path.getsize(profile_path)) - if int(os.path.getsize(profile_path)) > 0: - print("block profile is finished \n") else: - print("Building block profiles failed. Using prepareAlign to convert alignment\n") - new_path = core_path + group +"/"+ group + "_new.aln" - cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - print("block profile is finished \n") - - - searchBool = False - - for asName in assembly_names: - if searchBool == True: - break - if searchTaxon != '' and searchBool == False: - asName = searchTaxon - searchBool = True - - ################### path definitions ################################### - os.system('mkdir ' + out + '/tmp/' + asName) - tmp_path = out + "/tmp/" + asName + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - if searchTaxon != '': - orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" - fasOutFile = out + "/" + group + "_" + asName - mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" + if out[-1] != "/": + out = out + "/" + check_path(out) + + if os.path.exists(out + '/' + group): + if append != True and force != True: + print("Output folder for group " + group + " exists already. Please choose --force or --append.") + sys.exit() + elif force == True: + shutil.rmtree(out + '/' + group, ignore_errors=True) + refBool = False + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + elif append == True: + out = out + '/' + group + '/' + refBool = True else: - orthologsOutFile = out + "/" + group + ".extended.fa" - fasOutFile = out + "/" + group - mappingFile = out + "/tmp/" + group + ".mapping.txt" - - - print("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - ######################## tBLASTn ########################################### - - #database anlegen + refBool = False # checks if sequences of reference species were already part of the extended.fa file + else: + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + refBool = False - db_check = searching_for_db(db_path) - #print(assembly_path) - if db_check == 0: - print("creating a blast data base \n") - os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path) - print("database is finished \n") - else: - print('blast data base exists already, continuing...') + if core_path == '': + core_path = out + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' + check_path(core_path) + if assemblyDir == '': + assemblyDir = dataPath + '/assembly_dir/' + check_path(assemblyDir) - #make a tBLASTn search against the new database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + if metaeuk_db != '': + check_path(metaeuk_db) - print("tBLASTn search against data base") - os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -out ' + tmp_path + '/blast_results.out') - print("tBLASTn search is finished") - ################### search for candidate regions and extract seq ########### + try: + f = open(out + "/fdog.log", "a+") + except FileNotFoundError: + f = open(out + "/fdog.log", "w") - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + ################## How to handle std output and std error ################## - if regions == 0: - #no candidat region are available, no ortholog can be found - print("No candidate region found") - continue + if mode == 'silent': + sys.stderr = f + sys.stdout = f + else: + sys.stdout = Logger(f) + ########################### other variables ################################ + if searchTaxa == []: + assembly_names = os.listdir(assemblyDir) + else: + if len(searchTaxa) > 1: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa else: - print(str(number_regions) + " candiate regions were found. Extracting sequences...") - extract_seq(regions, db_path, tmp_path) - - ############### make Augustus PPX search ################################### - print("starting augustus ppx \n") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path) - print("augustus is finished \n") + if searchTaxa[0] in os.listdir(assemblyDir): + assembly_names = searchTaxa + elif os.path.isfile(searchTaxa[0]): + with open(searchTaxa[0]) as file: + lines = file.readlines() + assembly_names = [line.rstrip() for line in lines] + else: + print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0]) - ################# backward search to filter for orthologs################### - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path) + ################################# paths #################################### + fasta_path = core_path + "/" + group +"/"+ group + ".fa" + check_path(fasta_path) + tmp_folder = out + "/tmp" - if reciprocal_sequences == 0: - print("No ortholog fulfilled the reciprocity criteria") - if searchTaxon == '': - continue - else: - cleanup(tmp, tmp_path) - return 1 + ########### is/are fDOG reference species part of ortholog group? ########## - ################## checking accepted genes for co-orthologs ########################## - print(reciprocal_sequences) - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) + ###################### create tmp folder ################################### + cmd = 'mkdir ' + out + '/tmp' + starting_subprocess(cmd, 'silent') - ################ add sequences to extended.fa in the output folder########## - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - refBool = True + print("Gene: " + group) + print("fDOG reference species: " + fdog_ref_species + " \n") - ############### make Annotation with FAS ################################### - if searchTaxon != '' and fasoff == False: - fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('mkdir ' + tmp_path + 'anno_dir') - os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName ) + ###################### preparations ######################################## + if augustus == True: + group_computation_time_start = time.time() + consensus_path = consensusSequence(core_path, group, mode, out) + profile_path = blockProfiles(core_path, group, mode, out) + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + else: + #print("test") + profile_path = "" + group_computation_time_start = time.time() + consensus_path = consensusSequence(core_path, group, mode, out) + #concatinade core_group sequences if metaeuk should be run without tblastn + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + + + ###################### ortholog search ##################################### + + ortholog_sequences = [] + time_ortholog_start = time.time() + + if parallel == True: + ##################### parallel computation ############################# + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]) + + results = (pool.imap_unordered(ortholog_search_tblastn, calls)) + pool.close() + pool.join() + for i in results: + ortholog_sequences.append([i[0], i[1]]) + for k in i[2]: + print(k) + else: + ###################### computation species wise ################ + for asName in assembly_names: + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db] + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + for k in output_ortholog_search: + print(k) + + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + ".extended.fa" + + if taxa == []: + taxa = [fdog_ref_species] + if append == True: + addSeq(orthologsOutFile, ortholog_sequences) + else: + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + mappingFile = out + "/tmp/" + group + ".mapping.txt" - if refBool == False and searchTaxon == '': - print("No orthologs found. Exciting ...") - cleanup(tmp, tmp_path) - return 1 + if fasoff == False: + fas = time.time() + print("Calculating FAS scores ...") - if fasoff == False and searchTaxon == '': tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group ) - + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + starting_subprocess(cmd, 'silent') + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") ################# remove tmp folder ######################################## - if searchTaxon != '': - cleanup(tmp, tmp_path) - else: - cleanup(tmp, out + "/tmp/") + end = time.time() + time_fas = end - fas + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") + print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + sys.stdout = sys.__stdout__ f.close() - + cleanup(tmp, tmp_folder) if __name__ == '__main__': main() diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py deleted file mode 100644 index ea6e084..0000000 --- a/fdog/mergeAssemblyOutput.py +++ /dev/null @@ -1,122 +0,0 @@ -# -*- coding: utf-8 -*- - -####################################################################### -# Copyright (C) 2020 Vinh Tran -# -# This script is used to merge all output files (.extended.fa, .phyloprofile, -# _forward.domains, _reverse.domains) in a given directory into one file each. -# -# This script is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for -# more details -# -# Contact: hannah.muelbaier@stud.uni-frankfurt.de -# -####################################################################### - -import sys -import os -from os import listdir as ldir -import argparse -from pathlib import Path - -def main(): - version = '0.0.1' - parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.') - parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', - action='store', default='', required=True) - parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) - parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False) - args = parser.parse_args() - - directory = args.input - out = args.output - cleanup = args.cleanup - if not os.path.exists(os.path.abspath(directory)): - sys.exit('%s not found' % directory) - else: - directory = os.path.abspath(directory) - - phyloprofile = None - set_phylo = set() - domains_0 = None - set_domains_f = set() - domains_1 = None - set_domains_r = set() - ex_fasta = None - set_fasta = set() - header_bool = False - for infile in ldir(directory): - if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': - if not phyloprofile: - phyloprofile = open(out + '.phyloprofile', 'w') - phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo: - phyloprofile.write(line) - if len(lines) > 1: - set_phylo = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains': - if not domains_0: - domains_0 = open(out + '_forward.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_f: - domains_0.write(line) - if len(lines) > 1: - set_domains_f = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains': - if not domains_1: - domains_1 = open(out + '_reverse.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_r: - domains_1.write(line) - if len(lines) > 1: - set_domains_r = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa': - if not ex_fasta: - ex_fasta = open(out + '.extended.fa', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - header = set() - #print(set_fasta) - for line in lines: - if line[0] == ">": - header.add(line) - if line not in set_fasta: - ex_fasta.write(line) - header_bool = True - else: - header_bool = False - else: - if header_bool == True: - ex_fasta.write(line) - set_fasta = header - if cleanup == True: - os.remove(directory + '/' +infile) - - if phyloprofile: - phyloprofile.close() - if domains_0: - domains_0.close() - if domains_1: - domains_1.close() - if ex_fasta: - ex_fasta.close() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 7d73cdd..be552a7 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -107,10 +107,9 @@ def compileCore(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute]) + coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) if len(coreCompilationJobs) > 0: pool = mp.Pool(cpu) coreOut = [] @@ -132,7 +131,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') if mute == True: print(seed) else: @@ -331,14 +330,6 @@ def main(): optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -420,15 +411,6 @@ def main(): silent = False else: silent = True - - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix ### check fas if not fasoff: diff --git a/fdog/runSingle.py b/fdog/runSingle.py index 7e0f858..f239f90 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -67,13 +67,13 @@ def getfdogInfo(fdogPath, infoType): exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args # basic command (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) + (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs + cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) # add other I/O options (append, force, noCleanup, group, blast, db) = ioArgs if append == True: @@ -165,28 +165,7 @@ def runSingle(args): cmd = cmd + ' -debug' if silent == True: cmd = cmd + ' -silent' - # add assembly options - (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs - if assembly == True: - cmd = cmd + ' -assembly' - cmd = cmd + ' -reuseCore' - if not augustusRefSpec == '': - cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec - else: - sys.exit('An augutus reference species is requiered by using the option --assembly') - if not avIntron == '': - cmd = cmd + ' -avIntron=%s' % avIntron - if not lengthExtension == '': - cmd = cmd + ' -lengthExtension=%s' % lengthExtension - if not assemblyFile == '': - cmd = cmd + ' -assemblyFile=%s' % assemblyFile - if not searchTool == '': - cmd = cmd + ' -searchTool=%s' % searchTool - if not matrix == '': - cmd = cmd + ' -scoringmatrix=%s' % matrix - if not dataPath == '': - cmd = cmd + ' -dataPath=%s' % dataPath - #print(cmd) + # print(cmd) if mute == True: cmd = cmd + ' > /dev/null 2>&1' try: @@ -238,8 +217,6 @@ def main(): optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) @@ -326,14 +303,6 @@ def main(): optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -353,7 +322,6 @@ def main(): searchpath = args.searchpath weightpath = args.weightpath pathFile = args.pathFile - assemblypath = args.assemblypath # other I/O arguments append = args.append @@ -415,15 +383,6 @@ def main(): else: silent = True - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix - ### get fdog and data path dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') @@ -471,29 +430,19 @@ def main(): except: sys.exit('weightpath not found in %s' % pathFile) - if assemblypath == '': - assemblypath = dataPath + '/assembly_dir' - if dataPath == 'config': - try: - assemblypath = cfg['assemblypath'] - except: - sys.exit('assemblypath not found in %s' % pathFile) - if assembly == True: - searchpath = assemblypath - ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) # group arguments basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath] + pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, checkOff, debug, silent] ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False]) + runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) ### create PhyloProfile config file createConfigPP(outpath, seqName, refspec) diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh index e5ca4a9..1eaf176 100755 --- a/fdog/setup/install_lib.sh +++ b/fdog/setup/install_lib.sh @@ -85,6 +85,7 @@ dependenciesUbuntu=( perl-doc locales lib32z1 + augustus ) dependenciesMac=( @@ -94,6 +95,7 @@ dependenciesMac=( mafft brewsci/bio/muscle blast + augustus ) if [ "$sys" == "Darwin" ]; then @@ -108,7 +110,11 @@ else sudo apt-get update -y for i in "${dependenciesUbuntu[@]}"; do echo $i - sudo apt-get install -y -qq $i > /dev/null + if ["$i" == "augustus"]; then + sudo apt install augustus > /dev/null + else + sudo apt-get install -y -qq $i > /dev/null + fi done fi @@ -119,6 +125,7 @@ dependencies=( mafft muscle blastn + augustus ) for i in "${dependencies[@]}"; do diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 7515ed2..28eb851 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -315,6 +315,8 @@ mafft muscle clustalw blastp +augustus +tblastn ) for i in "${dependencies[@]}"; do @@ -324,6 +326,14 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi + if [ $tool == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi + fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" flag=1 diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index cf1bc6d..73b8573 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -116,6 +116,7 @@ dependencies=( mafft # for linsi muscle fasta36 + augustus #for fdog.assembly ) for i in "${dependencies[@]}"; do @@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do fi elif [ "$tool" = "fasta36" ]; then conda install -y -c bioconda fasta3 + elif [ "$tool" = "augustus" ]; then + conda install -y -c bioconda augustus else conda install -y -c bioconda $i fi @@ -363,6 +366,8 @@ clustalw mafft muscle fasta3 +augustus +tblastn ) for i in "${condaPkgs[@]}"; do if [[ -z $(conda list | $grepprog "$i ") ]]; then @@ -375,6 +380,13 @@ for i in "${condaPkgs[@]}"; do progname="hmmsearch" elif [ "$i" == "fasta3" ]; then progname="fasta36" + elif [ "$i" == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi fi if [ -z "$(which $progname)" ]; then echo -e "\t\e[31m$i could not be installed\e[0m"