Skip to content

Commit

Permalink
Merge pull request #9 from mueli94/master
Browse files Browse the repository at this point in the history
bug fix fDOG parallelisation
  • Loading branch information
HannahBioI authored Mar 10, 2022
2 parents 14c852c + 3bb9075 commit 9856606
Show file tree
Hide file tree
Showing 6 changed files with 260 additions and 88 deletions.
57 changes: 46 additions & 11 deletions fdog/bin/hamstr.pl
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,11 @@
## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name
## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory
## 19.03.2021 (v13.4.5 - vinh) do not replace space by @ for hmm output in parseHmmer4pm
## 12.01.2022 (v13.4.6 - vinh) change aligner from MUSCLE to MAFFT if the sequence is longer than 12,000 aa

######################## start main ###########################################
my $version = "HaMStR v.13.4.5";
my $version = "HaMStR v.13.4.6";

######################## checking whether the configure script has been run ###
my $configure = 0;
if ($configure == 0){
Expand Down Expand Up @@ -2095,18 +2097,27 @@ sub identifyCoorthologsProt{
open (OUT, ">$tmpdirTmp/$localid.orth.fa") or die "failed to open $localid.orth.fa\n";
print OUT join "\n", @out;
close OUT;

## check sequence length
my $alignmentprog_co_tmp = $alignmentprog_co;
my $tooLong = checkSeqLen("$tmpdir/$localid.orth.fa");
if ($tooLong == 1) {
$alignmentprog_co_tmp = "mafft-linsi";
}
printOUT("Aligner: $alignmentprog_co_tmp\n");

## aligning sequences
if ($alignmentprog_co eq 'mafft-linsi'){
if ($alignmentprog_co_tmp eq 'mafft-linsi'){
`mafft --maxiterate 1000 --localpair --anysymbol --quiet $tmpdir/$localid.orth.fa > "$tmpdirTmp/$localid.orth.aln"`;
}
elsif ($alignmentprog_co eq 'muscle') {
`$alignmentprog_co -quiet -in $tmpdir/$localid.orth.fa -out "$tmpdirTmp/$localid.orth.aln"`;
elsif ($alignmentprog_co_tmp eq 'muscle') {
`muscle -quiet -in $tmpdir/$localid.orth.fa -out "$tmpdirTmp/$localid.orth.aln"`;
}
else {
die "$alignmentprog_co is neither mafft-linsi nor muscle\n";
die "$alignmentprog_co_tmp is neither mafft-linsi nor muscle\n";
}
if (! -e "$tmpdirTmp/$localid.orth.aln") {
die "something wrong running $alignmentprog_co\n";
die "something wrong running $alignmentprog_co_tmp\n";
}
## do the matrix caluclation
my $in = Bio::AlignIO->new(-format => 'fasta',
Expand Down Expand Up @@ -2146,6 +2157,21 @@ sub identifyCoorthologsProt{
}
printOUT(join "\n", @infofile);
}

######## sub check sequence length. If a sequence is longer than 12.000 aa,
######## change MUSCLE to MAFFT-LINSI (due to Segmentation fault issue of MUSCLE)
sub checkSeqLen {
my $file =$_[0];
my $out = `awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length(\$0)}END{print seqlen}' "$file"`;
my @out = split("\n", $out);
foreach my $line (@out) {
if ($line !~ />/ & $line > 12000) {
return(1)
}
}
return(0)
}

######## sub checkCoorthologRef
sub checkCoorthologRef {
## relevant steps are
Expand All @@ -2158,18 +2184,27 @@ sub checkCoorthologRef {
open (OUT, ">$tmpdir/$localid.co.fa") or die "failed to open $localid.co.fa\n";
print OUT ">query\n$query\n>best\n$best\n>ref\n$ref\n";
close OUT;

## check sequence length
my $alignmentprog_co_tmp = $alignmentprog_co;
my $tooLong = checkSeqLen("$tmpdir/$localid.co.fa");
if ($tooLong == 1) {
$alignmentprog_co_tmp = "mafft-linsi";
}
printOUT("Aligner: $alignmentprog_co_tmp\n");

## aligning sequences
if ($alignmentprog_co eq 'mafft-linsi'){
if ($alignmentprog_co_tmp eq 'mafft-linsi'){
`mafft --maxiterate 1000 --localpair --anysymbol --quiet $tmpdir/$localid.co.fa > "$tmpdir/$localid.co.aln"`;
}
elsif ($alignmentprog_co eq 'muscle') {
`$alignmentprog_co -quiet -in $tmpdir/$localid.co.fa -out "$tmpdir/$localid.co.aln"`;
elsif ($alignmentprog_co_tmp eq 'muscle') {
`muscle -quiet -in $tmpdir/$localid.co.fa -out "$tmpdir/$localid.co.aln"`;
}
else {
die "$alignmentprog_co is neither mafft-linsi nor muscle\n";
die "$alignmentprog_co_tmp is neither mafft-linsi nor muscle\n";
}
if (! -e "$tmpdir/$localid.co.aln") {
die "something wrong running $alignmentprog_co in routine checkCoorthologRef\n";
die "something wrong running $alignmentprog_co_tmp in routine checkCoorthologRef\n";
}
## do the matrix caluclation
my $in = Bio::AlignIO->new(-format => 'fasta',
Expand Down
121 changes: 73 additions & 48 deletions fdog/bin/oneSeq.pl
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,11 @@
## Modified 31. May 2021 v2.3.1 (Vinh) - added auto annotation for fdogFas
## Modified 11. June 2021 v2.3.2 (Vinh) - fixed --append option
## Modified 16. June 2021 v2.4.0 (Vinh) - add checkOff option
## Modified 10. Mar 2022 v2.4.1 (Vinh) - fixed bug missing results in multiprocessing

############ General settings
my $version = 'oneSeq v.2.4.0';
my $version = 'oneSeq v.2.4.1';

##### configure for checking if the setup.sh script already run
my $configure = 0;
if ($configure == 0){
Expand Down Expand Up @@ -681,10 +683,39 @@
}
}
}
runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln);
runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln, 0);
$pm->finish;
}
$pm->wait_all_children;

### join result files
unless (-e $finalOutput) {
open(EXTENDEDFA, ">$finalOutput") or die "Cannot create $finalOutput\n";
} else {
open(EXTENDEDFA, ">>$finalOutput") or die "Cannot create $finalOutput\n";
}
opendir(my $dh, $outputPath) || die "Cannot open $outputPath: $!";
while (readdir $dh) {
if ($_ =~ /hamstrsearch_(.)+_$seqName(\.strict)*\.out$/) {
open(RESULT, "<$outputPath/$_") or warn "Cannot open $outputPath/$_!";
while (my $line = <RESULT>) {
chomp $line;
my @tmp = split(/\|/, $line);
my $seq = pop(@tmp);
splice(@tmp, 1, 1);
my $id = join("|", @tmp);
print EXTENDEDFA ">$id\n$seq\n";
}
close(RESULT);
unlink("$outputPath/$_") or warn "Cannot delete $outputPath/$_!"
}
}
closedir $dh;
close(EXTENDEDFA);
}
### remove duplicated seq in extended.fa
if (-e $finalOutput) {
addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
}
### remove duplicated seq in extended.fa
if (-e $finalOutput) {
Expand Down Expand Up @@ -1294,25 +1325,8 @@ sub checkOptions {

while (($input !~ /^[aor]/i) and ($breaker < 4)) {
$breaker++;
die "\nAn outputfile $finalOutput already exists. Please consider option -force for overwriting it or option -append for appending to it.\n"
# $input = getInput("\nAn outputfile $finalOutput already exists. Shall I overwrite it [o], or rename it [r], or [a] append to it?", 1);
# if (($breaker > 3) and ($input !~ /[aor]/i)){
# print "Please consider option -force or option -append.\n";
# die "No proper answer is given: Quitting\n";
# }
}
# if ($input =~ /[aA]/) {
# $append = 1;
# $force = 0;
# }
# elsif ($input =~ /[oO]/){
# $append = 0;
# $force = 1;
# }
# else {
# $append = 0;
# $force = 0;
# }
die "\nAn outputfile $finalOutput already exists. Please consider option --force for overwriting it or option --append for appending to it.\n"
}
}
if ($force){
## the user wants to overwrite
Expand All @@ -1338,7 +1352,7 @@ sub checkOptions {
## no extended.profile file exists but not necessary, because user switched off FAS support -> do nothing
}
else {
printOut("Option -append was selected, but the existing output was incomplete. Please restart with the -force option to overwrite the output", 1);
printOut("Option --append was selected, but the existing output was incomplete. Please restart with the --force option to overwrite the output", 1);
exit;
}
}
Expand Down Expand Up @@ -1593,7 +1607,7 @@ sub getBestOrtholog {
print $coreTaxonName, "\n";
}
}
runHamstr($coreTaxon, $seqName, $outputFa, $refSpec, $core_hitlimit, $core_rep, $corestrict, $coremode, $eval_blast, $eval_hmmer, $aln);
runHamstr($coreTaxon, $seqName, $outputFa, $refSpec, $core_hitlimit, $core_rep, $corestrict, $coremode, $eval_blast, $eval_hmmer, $aln, 1);
## check weather a candidate was found in the searched taxon
if(-e $candidatesFile) {

Expand Down Expand Up @@ -1932,7 +1946,7 @@ sub getTaxonName {
##################### perform the search for orthologs
# using the core-orthologs found in the previous steps
sub runHamstr {
my ($taxon, $seqName, $outputFa, $refSpec, $hitlimit, $rep, $sub_strict, $subcoremode, $ev_blst, $ev_hmm, $aln) = (@_);
my ($taxon, $seqName, $outputFa, $refSpec, $hitlimit, $rep, $sub_strict, $subcoremode, $ev_blst, $ev_hmm, $aln, $core) = (@_);
my $taxaDir = $taxaPath . $taxon;
printDebug("Running fdog: $taxon\t$seqName\t$outputFa\t$refSpec\t$taxaDir");
if (! -e $taxaDir) {
Expand Down Expand Up @@ -2008,29 +2022,31 @@ sub runHamstr {
printDebug(@hamstr);
system(@hamstr) == 0 or die "Error: fdog failed for " . $taxon . "\n";

if ($outputFa !~ /extended/){
$outputFa .= '.extended';
}
if(-e $resultFile) {
unless (-e $outputFa) {
open(EXTENDEDFA, ">$outputFa") or die "Cannot create $outputFa\n";
} else {
open(EXTENDEDFA, ">>$outputFa") or die "Cannot create $outputFa\n";
if ($core == 1) {
if ($outputFa !~ /extended/){
$outputFa .= '.extended';
}
my $resultFa = Bio::SeqIO->new(-file => $resultFile, '-format' => 'Fasta');
while(my $resultSeq = $resultFa->next_seq) {
if ($resultSeq->id =~ /$taxon\|(.)+\|[01]$/) {
my @tmpId = split("\\|", $resultSeq->id);
print EXTENDEDFA ">$tmpId[0]\|$tmpId[-3]\|$tmpId[-2]\|$tmpId[-1]\n",$resultSeq->seq,"\n";
if(-e $resultFile) {
unless (-e $outputFa) {
open(EXTENDEDFA, ">$outputFa") or die "Cannot create $outputFa\n";
} else {
open(EXTENDEDFA, ">>$outputFa") or die "Cannot create $outputFa\n";
}
my $resultFa = Bio::SeqIO->new(-file => $resultFile, '-format' => 'Fasta');
while(my $resultSeq = $resultFa->next_seq) {
if ($resultSeq->id =~ /$taxon\|(.)+\|[01]$/) {
my @tmpId = split("\\|", $resultSeq->id);
print EXTENDEDFA ">$tmpId[0]\|$tmpId[-3]\|$tmpId[-2]\|$tmpId[-1]\n",$resultSeq->seq,"\n";
}
}
# addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa);
} else {
# add seed sequence to output extended.fa if no ortholog was found in refSpec
if ($taxon eq $refSpec) {
addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa);
}
printDebug("$resultFile not found");
}
# addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa);
} else {
# add seed sequence to output extended.fa if no ortholog was found in refSpec
if ($taxon eq $refSpec) {
addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa);
}
printDebug("$resultFile not found");
}
}
#remove the created folders and files
Expand All @@ -2044,17 +2060,26 @@ sub runHamstr {
if (!$strict) {
$delCommandFa = "rm -rf \"" . $outputPath . "/fa_dir_" . $taxon . "_" . $seqName . "_" . $refSpec . "\"";
$delCommandHmm = "rm -rf \"" . $outputPath . "/hmm_search_" . $taxon . "_" . $seqName . "\"";
$delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".out" . "\"";
if ($core == 1) {
$delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".out" . "\"";
}
} else {
$delCommandFa = "rm -rf \"" . $outputPath . "/fa_dir_" . $taxon . "_" . $seqName . "_strict" . "\"";
$delCommandHmm = "rm -rf \"" . $outputPath . "/hmm_search_" . $taxon . "_" . $seqName . "\"";
$delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".strict.out" . "\"";
if ($core == 1) {
$delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".strict.out" . "\"";
}
}
printDebug("executing $delCommandFa", "executing $delCommandHmm");
if ($core == 1) {
printDebug("executing $delCommandHam");
}
printDebug("executing $delCommandFa", "executing $delCommandHmm", "executing $delCommandHam");
if ($autoclean) {
system ($delCommandFa) == 0 or die "Error deleting result files\n";
system ($delCommandHmm) == 0 or die "Error deleting result files\n";
system ($delCommandHam) == 0 or die "Error deleting result files\n";
if ($core == 1) {
system ($delCommandHam) == 0 or die "Error deleting result files\n";
}
}
}
else {
Expand Down
42 changes: 30 additions & 12 deletions fdog/mergeOutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,21 @@
import os
from os import listdir as ldir
import argparse
import yaml

def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out):
settings = dict(
mainInput = '%s/%s' % (directory, phyloprofile),
fastaInput = '%s/%s' % (directory, ex_fasta)
)
if not domains_0 == None:
settings['domainInput'] = '%s/%s' % (directory, domains_0)
settings['clusterProfile'] = 'TRUE'
with open('%s.config.yml' % (out), 'w') as outfile:
yaml.dump(settings, outfile, default_flow_style = False)

def main():
version = '0.0.1'
version = '0.1.0'
parser = argparse.ArgumentParser(description='You are running fdog.mergeOutput version ' + str(version) + '.')
parser.add_argument('-i', '--input',
help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
Expand All @@ -42,23 +53,26 @@ def main():
domains_0 = None
domains_1 = None
ex_fasta = None
lines_seen = set()
for infile in ldir(directory):
if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile':
if not phyloprofile:
phyloprofile = open(out + '.phyloprofile', 'w')
phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n')
phyloprofile = out + '.phyloprofile'
phyloprofile_out = open(phyloprofile, 'w')
with open(directory + '/' + infile, 'r') as reader:
lines = reader.readlines()
for line in lines:
if not line == 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n':
phyloprofile.write(line)
if line not in lines_seen: # not a duplicate
phyloprofile_out.write(line)
lines_seen.add(line)
elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains':
if not domains_0:
domains_0 = open(out + '_forward.domains', 'w')
domains_0 = out + '_forward.domains'
domains_0_out = open(domains_0, 'w')
with open(directory + '/' + infile, 'r') as reader:
lines = reader.readlines()
for line in lines:
domains_0.write(line)
domains_0_out.write(line)
elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains':
if not domains_1:
domains_1 = open(out + '_reverse.domains', 'w')
Expand All @@ -68,19 +82,23 @@ def main():
domains_1.write(line)
elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa':
if not ex_fasta:
ex_fasta = open(out + '.extended.fa', 'w')
ex_fasta = out + '.extended.fa'
ex_fasta_out = open(ex_fasta, 'w')
with open(directory + '/' + infile, 'r') as reader:
lines = reader.readlines()
for line in lines:
ex_fasta.write(line)
ex_fasta_out.write(line)
if phyloprofile:
phyloprofile.close()
phyloprofile_out.close()
if domains_0:
domains_0.close()
domains_0_out.close()
if domains_1:
domains_1.close()
if ex_fasta:
ex_fasta.close()
ex_fasta_out.close()

createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out)
print('Done! Output files:\n%s/%s.*' % (directory,out))


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 9856606

Please sign in to comment.