Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions nextflow/ProteinFunction/bin/capture_expected_errors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ elif [ "${category}" == "pph2" ]; then
errors=("Failed to locate sequence position")
fi

# Catch out-of-memory kills: exit 140 so the workflow manager can detect
# the OOM condition and resubmit the task with more memory.
mem_error="Some of the step tasks have been OOM Killed."
# Quote the file argument: unquoted ${stderr_file} is subject to word
# splitting/globbing, and an empty value would make grep read stdin and hang.
if grep -q "${mem_error}" "${stderr_file}"; then
  exit 140
fi

# Capture expected errors
for error in "${errors[@]}"; do
if grep -q "${error}" ${stderr_file}; then
Expand Down
46 changes: 28 additions & 18 deletions nextflow/ProteinFunction/bin/store_polyphen_scores.pl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
use Digest::MD5 qw(md5_hex);

my ($species, $port, $host, $user, $pass, $dbname,
my ($species, $offline, $sqlite,
$port, $host, $user, $pass, $dbname,
$peptide, $output_file, $model) = @ARGV;

# Extract model name
Expand Down Expand Up @@ -63,25 +64,34 @@

# save the predictions to the database unless they are null matrices
if ( $any_results ){
my $var_dba = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(
'-species' => $species,
'-port' => $port,
'-host' => $host,
'-user' => $user,
'-pass' => $pass,
'-dbname' => $dbname
);
my $pfpma = $var_dba->get_ProteinFunctionPredictionMatrixAdaptor
or die "Failed to get matrix adaptor";
if (!$offline){
my $var_dba = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(
'-species' => $species,
'-port' => $port,
'-host' => $host,
'-user' => $user,
'-pass' => $pass,
'-dbname' => $dbname
);
my $pfpma = $var_dba->get_ProteinFunctionPredictionMatrixAdaptor
or die "Failed to get matrix adaptor";
# check if identical predictions are already stored
my $data = $pfpma->fetch_polyphen_predictions_by_translation_md5($md5, $model_name);
if (defined $data && defined $data->{matrix} && $data->{matrix} eq $pred_matrix->serialize) {
warn "Skipping: identical PolyPhen-2 predictions already stored in database\n"
} else {
$pfpma->store($pred_matrix);
}
$var_dba->dbc and $var_dba->dbc->disconnect_if_idle();
}

if ($sqlite){
my $dbh = DBI->connect("dbi:SQLite:dbname=$sqlite","","");
my $sth = $dbh->prepare("INSERT INTO predictions VALUES(?, ?, ?)");

# check if identical predictions are already stored
my $data = $pfpma->fetch_polyphen_predictions_by_translation_md5($md5, $model_name);
if (defined $data && defined $data->{matrix} && $data->{matrix} eq $pred_matrix->serialize) {
warn "Skipping: identical PolyPhen-2 predictions already stored in database\n"
} else {
$pfpma->store($pred_matrix);
my $attrib_id = $model_name eq "humdiv" ? 269 : 268;
$sth->execute($pred_matrix->translation_md5, $attrib_id, $pred_matrix->serialize)
}
$var_dba->dbc and $var_dba->dbc->disconnect_if_idle();
} else {
warn "Skipping: no PolyPhen-2 predictions to store\n";
}
44 changes: 26 additions & 18 deletions nextflow/ProteinFunction/bin/store_sift_scores.pl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
use Digest::MD5 qw(md5_hex);

my ($species, $port, $host, $user, $pass, $dbname,
my ($species, $offline, $sqlite,
$port, $host, $user, $pass, $dbname,
$peptide, $res_file) = @ARGV;

# parse the results file
Expand Down Expand Up @@ -50,25 +51,32 @@

# save the predictions to the database
if ($results_available == 1 ){
my $var_dba = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(
'-species' => $species,
'-port' => $port,
'-host' => $host,
'-user' => $user,
'-pass' => $pass,
'-dbname' => $dbname
);
my $pfpma = $var_dba->get_ProteinFunctionPredictionMatrixAdaptor
or die "Failed to get matrix adaptor";
if (!$offline){
my $var_dba = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(
'-species' => $species,
'-port' => $port,
'-host' => $host,
'-user' => $user,
'-pass' => $pass,
'-dbname' => $dbname
);
my $pfpma = $var_dba->get_ProteinFunctionPredictionMatrixAdaptor
or die "Failed to get matrix adaptor";
# check if identical predictions are already stored
my $data = $pfpma->fetch_sift_predictions_by_translation_md5($md5);
if (defined $data && defined $data->{matrix} && $data->{matrix} eq $pred_matrix->serialize) {
warn "Skipping: identical SIFT predictions already stored in database\n"
} else {
$pfpma->store($pred_matrix);
}
$var_dba->dbc and $var_dba->dbc->disconnect_if_idle();
}

# check if identical predictions are already stored
my $data = $pfpma->fetch_sift_predictions_by_translation_md5($md5);
if (defined $data && defined $data->{matrix} && $data->{matrix} eq $pred_matrix->serialize) {
warn "Skipping: identical SIFT predictions already stored in database\n"
} else {
$pfpma->store($pred_matrix);
if ($sqlite){
my $dbh = DBI->connect("dbi:SQLite:dbname=$sqlite","","");
my $sth = $dbh->prepare("INSERT INTO predictions VALUES(?, ?, ?)");
$sth->execute($pred_matrix->translation_md5, 267, $pred_matrix->serialize)
}
$var_dba->dbc and $var_dba->dbc->disconnect_if_idle();
} else {
warn "Skipping: no SIFT predictions to store\n";
}
55 changes: 46 additions & 9 deletions nextflow/ProteinFunction/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ params.port = null
params.user = null
params.pass = null
params.database = null
params.offline = false

// SQLite database params
params.sqlite = params.offline
params.sqlite_dir = params.outdir.startsWith("/") ? params.outdir : "${workflow.launchDir}/${params.outdir}" // supports Unix-like only
params.sqlite_db = "${params.sqlite_dir}/${params.species}_PolyPhen_SIFT.db"
// SIFT params
params.sift_run_type = "NONE"
params.median_cutoff = 2.75 // as indicated in SIFT's README
Expand Down Expand Up @@ -53,12 +58,19 @@ if (params.help) {
--species VAL Latin species name (default: homo_sapiens);
PolyPhen-2 only works for human

Database options (mandatory):
Database options:
Ensembl variation database params -
--host VAL Server host
--port VAL Server port
--user VAL Server user
--pass VAL Server password
--database VAL Name of database
--offline No database connection to variation database

SQLite database params -
--sqlite 0 or 1, tells whether to create SQLite database (default: params.offline)
--sqlite_dir Directory where SQLite db would be created (default: params.outdir)
--sqlite_db Full path to SQLite db (--sqlite_dir would be ignored)

SIFT options:
--sift_run_type VAL SIFT run type:
Expand Down Expand Up @@ -87,7 +99,9 @@ include { translate_fasta } from './nf_modules/translations.nf'
include { clear_assemblies;
store_assemblies;
drop_translation_mapping;
store_translation_mapping } from './nf_modules/database.nf'
store_translation_mapping;
init_sqlite_db;
postprocess_sqlite_db } from './nf_modules/database.nf'
include { run_sift_pipeline } from './nf_modules/sift.nf'
include { run_pph2_pipeline } from './nf_modules/polyphen2.nf'

Expand All @@ -104,8 +118,12 @@ if (!params.translated) {
}
}

if (!params.host || !params.port || !params.user || !params.pass || !params.database) {
exit 1, "Error: --host, --port, --user, --pass and --database need to be defined"
if (!params.offline && (!params.host || !params.port || !params.user || !params.pass || !params.database)) {
exit 1, "ERROR: --host, --port, --user, --pass and --database need to be defined"
}

if (params.offline) {
log.info "INFO: --offline mode selected, --sqlite will be turned on by default. If you do not wish to generate SQLite db please use --sqlite 0."
}

// Check run type for each protein function predictor
Expand Down Expand Up @@ -159,6 +177,12 @@ def getFiles (files) {
}

workflow {
if (params.sqlite) {
sqlite_db_prep = init_sqlite_db()
} else {
sqlite_db_prep = "ready"
}

// Translate transcripts from GTF and FASTA if no translation FASTA is given
if (!params.translated) {
translate_fasta(getFiles(params.gtf), getFiles(params.fasta))
Expand All @@ -180,7 +204,7 @@ workflow {
md5: it.seqString.replaceAll(/\*/, "").md5() ]}

// Write translation mapping with transcript ID and MD5 hashes to database
if ( params.sift_run_type == "FULL" && params.pph_run_type == "FULL" ) {
if ( !params.offline && params.sift_run_type == "FULL" && params.pph_run_type == "FULL" ) {
drop_translation_mapping()
translation_mapping_wait = drop_translation_mapping.out
clear_assemblies()
Expand All @@ -193,16 +217,29 @@ workflow {
name: "translation_mapping.tsv",
storeDir: params.outdir,
newLine: true) { it.id + "\t" + it.md5 }
store_translation_mapping(translation_mapping, translation_mapping_wait)
store_assemblies(files.collect(), assemblies_wait)

if (!params.offline) {
store_translation_mapping(translation_mapping)
store_assemblies(files.collect(), assemblies_wait)
}

// Get unique translations based on MD5 hashes of their sequences
translated = translated.unique { it.md5 }

// Run protein function prediction
errors = Channel.of("# failure reasons")
if ( params.sift_run_type != "NONE" ) errors = errors.concat(run_sift_pipeline( translated ))
if ( params.pph_run_type != "NONE" ) errors = errors.concat(run_pph2_pipeline( translated ))

if ( params.sift_run_type != "NONE" ) {
errors = errors.concat(run_sift_pipeline( translated, sqlite_db_prep ).errors)
}

if ( params.pph_run_type != "NONE" ) {
errors = errors.concat(run_pph2_pipeline( translated, sqlite_db_prep ).errors)
}

if ( params.sqlite ) {
postprocess_sqlite_db(errors.collect())
}

errors
.collectFile(name: 'failure_reason.tsv', newLine: true, storeDir: params.outdir)
Expand Down
34 changes: 34 additions & 0 deletions nextflow/ProteinFunction/nf_modules/database.nf
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,37 @@ process get_current_MD5_translations {
EOF
"""
}

process init_sqlite_db {
  // Create (or reset) the SQLite `predictions` table used to store
  // serialized SIFT / PolyPhen-2 prediction matrices when running offline.
  output: stdout

  // Never cache: the database must be re-initialised on every fresh run
  cache false

  """
  #!/usr/bin/perl

  use strict;
  use warnings;
  use DBI;

  # RaiseError makes a failed connect or DDL statement abort the process
  # instead of silently leaving an unusable database behind.
  my \$dbh = DBI->connect("dbi:SQLite:dbname=${params.sqlite_db}", "", "",
                          { RaiseError => 1 });
  \$dbh->do("DROP TABLE IF EXISTS predictions");
  # Columns: md5 = translation MD5, analysis = attrib id, matrix = serialized matrix
  \$dbh->do("CREATE TABLE predictions(md5, analysis, matrix)");
  \$dbh->disconnect;
  """
}

process postprocess_sqlite_db {
  // After all predictions are stored, index the md5 column for fast lookups.
  input:
    val errors  // used only for synchronisation: runs after all stores finish

  output: stdout

  cache false

  """
  #!/usr/bin/perl

  use strict;
  use warnings;
  use DBI;

  # RaiseError surfaces connect/DDL failures instead of ignoring them.
  my \$dbh = DBI->connect("dbi:SQLite:dbname=${params.sqlite_db}", "", "",
                          { RaiseError => 1 });
  # IF NOT EXISTS makes this idempotent: a plain CREATE INDEX dies on a
  # resumed or repeated run because md5_idx already exists.
  \$dbh->do("CREATE INDEX IF NOT EXISTS md5_idx ON predictions(md5)");
  \$dbh->disconnect;
  """
}
47 changes: 27 additions & 20 deletions nextflow/ProteinFunction/nf_modules/polyphen2.nf
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,12 @@ process store_pph2_scores {
val species
tuple val(peptide), path(weka_output), val(model)

output:
stdout

"""
store_polyphen_scores.pl $species ${params.port} ${params.host} \
${params.user} ${params.pass} ${params.database} \
store_polyphen_scores.pl ${species} ${params.offline} ${params.sqlite_db} \
${params.port} ${params.host} ${params.user} ${params.pass} ${params.database} \
${peptide.seqString} ${weka_output} ${model}
"""
}
Expand All @@ -118,25 +121,29 @@ include { delete_prediction_data; update_meta } from './database.nf'
include { filter_existing_translations } from './translations.nf'

workflow run_pph2_pipeline {
take: translated
take:
translated
sqlite_db_prep
main:
if ( params.pph_run_type == "UPDATE" ) {
translated = filter_existing_translations( "polyphen_%", translated )
wait = "ready"
} else if ( params.pph_run_type == "FULL" ) {
delete_prediction_data("polyphen_%")
wait = delete_prediction_data.out
get_pph2_version()
update_meta("polyphen_version", get_pph2_version.out)
}
// Run PolyPhen-2 and Weka
pph2 = run_pph2_on_all_aminoacid_substitutions(translated)

weka_model = Channel.of("HumDiv.UniRef100.NBd.f11.model",
"HumVar.UniRef100.NBd.f11.model")
weka = run_weka(weka_model, pph2.scores)
store_pph2_scores(wait, // wait for data deletion
params.species, weka)
if ( params.pph_run_type == "UPDATE" && !params.offline ) {
translated = filter_existing_translations( "polyphen_%", translated )
wait = "ready"
} else if ( params.pph_run_type == "FULL" && !params.offline ) {
delete_prediction_data("polyphen_%")
wait = delete_prediction_data.out
get_pph2_version()
update_meta("polyphen_version", get_pph2_version.out)
} else {
wait = "ready"
}
// Run PolyPhen-2 and Weka
pph2 = run_pph2_on_all_aminoacid_substitutions(translated)

weka_model = Channel.of("HumDiv.UniRef100.NBd.f11.model",
"HumVar.UniRef100.NBd.f11.model")
weka = run_weka(weka_model, pph2.scores)
store_pph2_scores(wait, // wait for data deletion
params.species, weka)
emit:
errors = pph2.errors
}
Loading