diff --git a/CHANGELOG.md b/CHANGELOG.md index dfc2a0d..347aed3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +## [1.4.0] - 2019-03-21 +### Changed +- Supports for multi-round assembly polishing using both long and short reads. +- Setting adjustment for long-read filtering and downsampling. +- Software version updates for a number of dependencies. +### Fixed +- Compatibility issues due to recent version updates of conda an bioconda. +- Typos in the installation script. + ## [1.3.1] - 2019-01-22 ### Added - A script for generated demultiplexed fastq reads based on nanopore's guppy demultiplexing summary file. diff --git a/Example_Outputs/SK1.assembly.final.fa.gz b/Example_Outputs/SK1.assembly.final.fa.gz index 5d041e7..dcb7c82 100644 Binary files a/Example_Outputs/SK1.assembly.final.fa.gz and b/Example_Outputs/SK1.assembly.final.fa.gz differ diff --git a/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.INDEL.vcf.gz b/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.INDEL.vcf.gz index 07d08ac..49c8378 100644 Binary files a/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.INDEL.vcf.gz and b/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.INDEL.vcf.gz differ diff --git a/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.SNP.vcf.gz b/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.SNP.vcf.gz index 3854c43..93fe8cd 100644 Binary files a/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.SNP.vcf.gz and b/Example_Outputs/SK1.assembly.final.filter.mummer2vcf.SNP.vcf.gz differ diff --git a/Example_Outputs/SK1.assembly.final.filter.pdf b/Example_Outputs/SK1.assembly.final.filter.pdf index 0aa8e40..48ac43d 100644 Binary files a/Example_Outputs/SK1.assembly.final.filter.pdf and b/Example_Outputs/SK1.assembly.final.filter.pdf differ diff --git a/Example_Outputs/SK1.assembly.final.stats.txt b/Example_Outputs/SK1.assembly.final.stats.txt index 4a1376c..b286074 100644 --- a/Example_Outputs/SK1.assembly.final.stats.txt +++ b/Example_Outputs/SK1.assembly.final.stats.txt @@ -1,8 +1,8 @@ total sequence count: 34 -total sequence length: 12448004 +total sequence length: 12448003 min sequence length: 1248 max sequence length: 1480301 -mean sequence length: 366117.76 +mean sequence length: 366117.74 median sequence length: 60826.50 N50: 923676 L50: 6 diff --git a/Example_Outputs/SK1.final.cds.fa.gz b/Example_Outputs/SK1.final.cds.fa.gz index fcd514d..0075582 100644 Binary files a/Example_Outputs/SK1.final.cds.fa.gz and b/Example_Outputs/SK1.final.cds.fa.gz differ diff --git a/Example_Outputs/SK1.final.gff3.gz b/Example_Outputs/SK1.final.gff3.gz index 39247da..af1e457 100644 Binary files a/Example_Outputs/SK1.final.gff3.gz and b/Example_Outputs/SK1.final.gff3.gz differ diff --git a/Example_Outputs/SK1.final.manual_check.list b/Example_Outputs/SK1.final.manual_check.list index b54d801..a0482e6 100644 --- a/Example_Outputs/SK1.final.manual_check.list +++ b/Example_Outputs/SK1.final.manual_check.list @@ -218,25 +218,22 @@ SK1_G0056290|SK1_G0056290.mRNA.1 unexpected start & end codons based on standard SK1_G0057300|SK1_G0057300.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;incorrect CDS length SK1_G0057390|SK1_G0057390.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) SK1_G0057410|SK1_G0057410.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0057460|SK1_G0057460.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0057480|SK1_G0057480.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0057470|SK1_G0057470.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0057490|SK1_G0057490.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) SK1_G0057500|SK1_G0057500.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0057510|SK1_G0057510.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0057570|SK1_G0057570.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0057590|SK1_G0057590.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0057750|SK1_G0057750.mRNA.1 incorrect CDS length -SK1_G0057780|SK1_G0057780.mRNA.1 incorrect CDS length -SK1_G0057800|SK1_G0057800.mRNA.1 incorrect CDS length +SK1_G0057560|SK1_G0057560.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0057580|SK1_G0057580.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0057770|SK1_G0057770.mRNA.1 incorrect CDS length +SK1_G0057950|SK1_G0057950.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) SK1_G0057960|SK1_G0057960.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) SK1_G0057970|SK1_G0057970.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0057980|SK1_G0057980.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0058000|SK1_G0058000.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) SK1_G0058010|SK1_G0058010.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) SK1_G0058020|SK1_G0058020.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0058030|SK1_G0058030.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0058070|SK1_G0058070.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0058160|SK1_G0058160.mRNA.1 unexpected start codon based on standard genentic code;your selected code table is 1;incorrect CDS length -SK1_G0058260|SK1_G0058260.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) -SK1_G0058280|SK1_G0058280.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0058060|SK1_G0058060.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0058150|SK1_G0058150.mRNA.1 unexpected start codon based on standard genentic code;your selected code table is 1;incorrect CDS length +SK1_G0058250|SK1_G0058250.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) +SK1_G0058270|SK1_G0058270.mRNA.1 unexpected start & end codons based on standard genentic code;your selected code table is 1;internal stop codon(s) cox2|cox2.mRNA.1 unexpected stop codon based on standard genentic code;your selected code table is 3;incorrect CDS length orf474|orf474.mRNA.1 unexpected start codon based on standard genentic code;your selected code table is 3;incorrect CDS length orf90|orf90.mRNA.1 unexpected start codon based on standard genentic code;your selected code table is 3 diff --git a/Example_Outputs/SK1.final.pep.fa.gz b/Example_Outputs/SK1.final.pep.fa.gz index 97bae49..2f2d82a 100644 Binary files a/Example_Outputs/SK1.final.pep.fa.gz and b/Example_Outputs/SK1.final.pep.fa.gz differ diff --git a/Example_Outputs/SK1.final.trimmed_cds.fa.gz b/Example_Outputs/SK1.final.trimmed_cds.fa.gz index b3a7aae..f5a1c49 100644 Binary files a/Example_Outputs/SK1.final.trimmed_cds.fa.gz and b/Example_Outputs/SK1.final.trimmed_cds.fa.gz differ diff --git a/Manual.pdf b/Manual.pdf index e8306c3..f76778f 100644 Binary files a/Manual.pdf and b/Manual.pdf differ diff --git a/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh b/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh index 82d5682..2fdbe69 100755 --- a/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh +++ b/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh @@ -12,27 +12,31 @@ reads="./../00.Long_Reads/YGL3210.fq.gz" # The file path of the long reads file reads_type="nanopore-raw" # The long reads data type: "pacbio-raw" or "pacbio-corrected" or "nanopore-raw" or "nanopore-corrected". run_filtering="yes" # Whether to filter the reads: "yes" or "no". Default = "yes". genome_size="12500000" # The haploid genome size (in bp) of sequenced organism. Default = "12500000" (i.e. 12.5 Mb for the budding yeast S. cereviaie genome). This is used to calculate targeted sequencing coverage after read filtering (see below). -post_filtering_coverage="30" # Targeted sequencing coverage after read filtering. Default = "30" (i.e. 30x coverage). +post_filtering_coverage="40" # Targeted sequencing coverage after read filtering. Default = "40" (i.e. 40x coverage). threads=1 # The number of threads to use. Default = "1". ####################################### # process the pipeline -filtlong_target_bases=$(($genome_size * $post_filtering_coverage)) -echo "" -echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage, filtlong_target_bases=$filtlong_target_bases" -echo "" if [[ "$reads_type" == "nanopore-raw" || "$reads_type" == "nanopore-corrected" ]] then $porechop_dir/porechop -i $reads -o $prefix.porechop.fastq.gz --discard_middle --threads $threads > $prefix.porechop.summary.txt if [[ "$run_filtering" == "yes" ]] then - $filtlong_dir/filtlong --min_length 1000 --keep_percent 90 --target_bases $filtlong_target_bases $prefix.porechop.fastq.gz | gzip > $prefix.filtlong.fastq.gz + filtlong_target_bases=$(($genome_size * $post_filtering_coverage)) + echo "" + echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage, filtlong_target_bases=$filtlong_target_bases" + echo "" + $filtlong_dir/filtlong --min_length 1000 --mean_q_weight 10 --target_bases $filtlong_target_bases $prefix.porechop.fastq.gz | gzip > $prefix.filtlong.fastq.gz fi else if [[ "$run_filtering" == "yes" ]] then - $filtlong_dir/filtlong --min_length 1000 --keep_percent 90 --target_bases $filtlong_target_bases $reads | gzip > $prefix.filtlong.fastq.gz + filtlong_target_bases=$(($genome_size * $post_filtering_coverage)) + echo "" + echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage, filtlong_target_bases=$filtlong_target_bases" + echo "" + $filtlong_dir/filtlong --min_length 1000 --mean_q_weight 10 --target_bases $filtlong_target_bases $reads | gzip > $prefix.filtlong.fastq.gz fi fi diff --git a/Project_Template/00.Long_Reads/LRSDAY.00.PacBio.RSII_bax2bam.sh b/Project_Template/00.Long_Reads/LRSDAY.00.PacBio.RSII_bax2bam.sh index 48a1ee8..3e7671e 100755 --- a/Project_Template/00.Long_Reads/LRSDAY.00.PacBio.RSII_bax2bam.sh +++ b/Project_Template/00.Long_Reads/LRSDAY.00.PacBio.RSII_bax2bam.sh @@ -8,7 +8,7 @@ source ./../../env.sh # set project-specific variables prefix="SK1.SMRTCell.1" # The file name prefix for the output files. For the testing example, run this script four times with the prefix of "SK1.SMRTCell.1", "SK1.SMRTCell.2", "SK1.SMRTCell.3", and "SK1.SMRTCell.4" respectively. -pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/SK1.SMRTCell.1.RSII_bax.fofn" # The fofn file containing file paths to the PacBio RSII bax reads from the same SMRT cell. If you have data from multiple SMRT cells, please run this script sepearately for each of them. Do not mix reads from different SMRT cells even though they come from the same sample. For the testing example, you can set pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/$prefix.RSII_bax.fofn" to let this parameter to be automatically set up based on the prefix parameter. +pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/$prefix.RSII_bax.fofn" # The fofn file containing file paths to the PacBio RSII bax reads from the same SMRT cell. If you have data from multiple SMRT cells, please run this script sepearately for each of them. Do not mix reads from different SMRT cells even though they come from the same sample. For the testing example, you can set pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/$prefix.RSII_bax.fofn" to let this parameter to be automatically set up based on the prefix parameter. ####################################### # process the pipeline diff --git a/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh b/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh index b2f9564..bae15a7 100755 --- a/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh +++ b/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh @@ -14,7 +14,7 @@ prefix="SK1" # The file name prefix for output files of the testing example. # process the pipeline echo "download the bam file from the ENA database ..." -wget $file_url +wget --no-check-certificate $file_url echo "bam2fastq ..." $bedtools_dir/bedtools bamtofastq -i $file_name -fq $prefix.filtered_subreads.fastq echo "gzip fastq ..." @@ -24,12 +24,12 @@ rm $file_name cd pacbio_fofn_files echo "download the metadata and raw PacBio reads in .h5 format ..." -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml if [[ ! -d Analysis_Results ]] then @@ -38,35 +38,35 @@ fi cd Analysis_Results -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 - -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 - -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 + +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 + +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.bas.h5 + +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 + +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.bas.h5 + +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 ### diff --git a/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh b/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh index fbcb282..d03f79c 100755 --- a/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh +++ b/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh @@ -23,13 +23,15 @@ prefix="SK1" # The file name prefix for the output files. Default = "SK1" for th threads=1 # The number of threads to use. Default = "1". ploidy=1 # The ploidy status of the sequenced genome. use "1" for haploid genome and "2" for diploid genome. Default = "1" for the testing example. +rounds_of_successive_polishing=1 # The number of total rounds of long-read-based assembly polishing. Default = "1" for the testing example. debug="no" # Use "yes" if prefer to keep intermediate files, otherwise use "no". Default = "no" ########################################### # process the pipeline ln -s $input_assembly $prefix.assembly.raw.fa -$samtools_dir/samtools faidx $prefix.assembly.raw.fa +cp $prefix.assembly.raw.fa $prefix.assembly.tmp.fa + mkdir tmp if [[ $long_read_technology == "pacbio" ]] @@ -38,25 +40,42 @@ then source $miniconda2_dir/activate $conda_pacbio_dir/../../conda_pacbio_env if [[ $pacbio_reads_type == "RSII" ]] then - $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam - if [[ $ploidy == "1" ]] - then - $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf - else - $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf --diploid - fi + for i in $(seq 1 1 $rounds_of_successive_polishing) + do + $samtools_dir/samtools faidx $prefix.assembly.tmp.fa + $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.tmp.fa $prefix.pbalign.round_${i}.bam + if [[ $ploidy == "1" ]] + then + $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf + else + $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf --diploid + fi + rm $prefix.assembly.tmp.fa + rm $prefix.assembly.tmp.fa.fai + cp $prefix.assembly.consensus.round_${i}.fa $prefix.assembly.tmp.fa + rm $prefix.assembly.consensus.round_${i}.fq + rm $prefix.assembly.consensus.round_${i}.vcf + done else - $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam - if [[ $ploidy == "1" ]] - then - $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf - else - $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf --diploid - fi + for i in $(seq 1 1 $rounds_of_successive_polishing) + do + $samtools_dir/samtools faidx $prefix.assembly.tmp.fa + $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.tmp.fa $prefix.pbalign.round_${i}.bam + if [[ $ploidy == "1" ]] + then + $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf + else + $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf --diploid + fi + rm $prefix.assembly.tmp.fa + rm $prefix.assembly.tmp.fa.fai + cp $prefix.assembly.consensus.round_${i}.fa $prefix.assembly.tmp.fa + gzip $prefix.assembly.consensus.round_${i}.fq + gzip $prefix.assembly.consensus.round_${i}.vcf + done fi - gzip $prefix.assembly.consensus.fq - gzip $prefix.assembly.consensus.vcf - ln -s $prefix.assembly.consensus.fa $prefix.assembly.long_read_polished.fa + ln -s $prefix.assembly.consensus.round_${rounds_of_successive_polishing}.fa $prefix.assembly.long_read_polished.fa + rm $prefix.assembly.tmp.fa source $miniconda2_dir/deactivate else # perform correction using the minimap2-nanopolish pipeline @@ -67,16 +86,23 @@ else else $nanopolish_dir/nanopolish index -d $nanopore_fast5_files -s $nanopore_basecalling_sequencing_summary $long_reads_in_fastq fi - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary REFERENCE=$prefix.assembly.raw.fa OUTPUT=$prefix.assembly.raw.dict - $minimap2_dir/minimap2 -ax map-ont $prefix.assembly.raw.fa $long_reads_in_fastq > $prefix.minimap2.sam - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam INPUT=$prefix.minimap2.sam OUTPUT=$prefix.minimap2.bam SORT_ORDER=coordinate - $samtools_dir/samtools index $prefix.minimap2.bam - rm $prefix.minimap2.sam - python3 $nanopolish_dir/scripts/nanopolish_makerange.py $prefix.assembly.raw.fa | $parallel_dir/parallel --results ${prefix}_nanopolish_results -P 1 \ - $nanopolish_dir/nanopolish variants --consensus -o $prefix.polished.{1}.vcf -w {1} --ploidy $ploidy -r $long_reads_in_fastq -b $prefix.minimap2.bam -g $prefix.assembly.raw.fa -t $threads --min-candidate-frequency 0.2 || true - $nanopolish_dir/nanopolish vcf2fasta -g $prefix.assembly.raw.fa $prefix.polished.*.vcf > $prefix.assembly.nanopolish.fa - ln -s $prefix.assembly.nanopolish.fa $prefix.assembly.long_read_polished.fa - mv $prefix.polished.*.vcf ${prefix}_nanopolish_results + for i in $(seq 1 1 $rounds_of_successive_polishing) + do + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary -REFERENCE $prefix.assembly.tmp.fa -OUTPUT $prefix.assembly.tmp.dict + $minimap2_dir/minimap2 -ax map-ont $prefix.assembly.tmp.fa $long_reads_in_fastq > $prefix.minimap2.round_${i}.sam + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam -INPUT $prefix.minimap2.round_${i}.sam -OUTPUT $prefix.minimap2.round_${i}.bam -SORT_ORDER coordinate + $samtools_dir/samtools index $prefix.minimap2.round_${i}.bam + rm $prefix.minimap2.round_${i}.sam + python3 $nanopolish_dir/scripts/nanopolish_makerange.py $prefix.assembly.tmp.fa | $parallel_dir/parallel --results ${prefix}_nanopolish_round_${i}_results -P 1 \ + $nanopolish_dir/nanopolish variants --consensus -o $prefix.polished.{1}.vcf -w {1} --ploidy $ploidy -r $long_reads_in_fastq -b $prefix.minimap2.round_${i}.bam -g $prefix.assembly.tmp.fa -t $threads --min-candidate-frequency 0.2 || true + $nanopolish_dir/nanopolish vcf2fasta -g $prefix.assembly.tmp.fa $prefix.polished.*.vcf > $prefix.assembly.nanopolish.round_${i}.fa + rm $prefix.assembly.tmp.fa + rm $prefix.assembly.tmp.dict + cp $prefix.assembly.nanopolish.round_${i}.fa $prefix.assembly.tmp.fa + mv $prefix.polished.*.vcf ${prefix}_nanopolish_round_${i}_results + done + ln -s $prefix.assembly.nanopolish.round_${rounds_of_successive_polishing}.fa $prefix.assembly.long_read_polished.fa + rm $prefix.assembly.tmp.fa fi rm -r tmp diff --git a/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh b/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh index e02c052..a14b0b8 100755 --- a/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh +++ b/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh @@ -9,6 +9,8 @@ source ./../../env.sh input_assembly="./../02.Long-read-based_Assembly_Polishing/SK1.assembly.long_read_polished.fa" # The file path of the input assembly before Illumina-based correction prefix="SK1" # The file name prefix for the output files. +trim_illumina_reads="yes" # Whether to trim the input Illumina reads. Use "yes" if prefer to perform trimming, otherwise use "no". Default = "yes". +rounds_of_successive_polishing=1 # The number of total rounds of Illumina-read-based assembly polishing. Default = "1" for the testing example. threads=1 # The number of threads to use. Default = "1". mode="PE" # Illumina sequencing mode, "PE" for paired-end sequencing and "SE" for single-end sequencing. Default = "PE". fixlist="snps,indels" # The types of errors for Illumina-read-based correction by Pilon; see Pilon's manual for more details. Default = "snps,indels". @@ -34,137 +36,147 @@ else ln -s $reads_SE raw.fq.gz; fi -ln -s $input_assembly refseq.fa - +cp $input_assembly refseq.tmp.fa cp $adapter adapter.fa mkdir tmp -if [[ $mode == "PE" ]] -then - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar PE -threads $threads -phred33 raw.R1.fq.gz raw.R2.fq.gz trimmed.R1.fq.gz trimmed.unpaired.R1.fq.gz trimmed.R2.fq.gz trimmed.unpaired.R2.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 - rm trimmed.unpaired.R1.fq.gz - rm trimmed.unpaired.R2.fq.gz -else - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar SE -threads $threads -phred33 raw.fq.gz trimmed.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 -fi - -# bwa mapping -$bwa_dir/bwa index refseq.fa -if [[ $mode == "PE" ]] +if [[ $trim_illumina_reads == "yes" ]] then - $bwa_dir/bwa mem -t $threads -M refseq.fa trimmed.R1.fq.gz trimmed.R2.fq.gz >$prefix.sam + if [[ $mode == "PE" ]] + then + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar PE -threads $threads -phred33 raw.R1.fq.gz raw.R2.fq.gz trimmed.R1.fq.gz trimmed.unpaired.R1.fq.gz trimmed.R2.fq.gz trimmed.unpaired.R2.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 + rm trimmed.unpaired.R1.fq.gz + rm trimmed.unpaired.R2.fq.gz + mv trimmed.R1.fq.gz clean.R1.fq.gz + mv trimmed.R2.fq.gz clean.R2.fq.gz + else + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar SE -threads $threads -phred33 raw.fq.gz trimmed.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 + mv trimmed.fq.gz clean.fq.gz + fi else - $bwa_dir/bwa mem -t $threads -M refseq.fa trimmed.fq.gz >$prefix.sam + if [[ $mode == "PE" ]] + then + cp raw.R1.fq.gz clean.R1.fq.gz + cp raw.R2.fq.gz clean.R2.fq.gz + else + cp raw.fq.gz clean.fq.gz + fi fi -# index reference sequence -$samtools_dir/samtools faidx refseq.fa -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary \ - -REFERENCE refseq.fa \ - -OUTPUT refseq.dict - -# sort bam file by picard-tools SortSam -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam \ - -INPUT $prefix.sam \ - -OUTPUT $prefix.sort.bam \ - -SORT_ORDER coordinate - -# fixmate -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation \ - -INPUT $prefix.sort.bam \ - -OUTPUT $prefix.fixmate.bam - -# add or replace read groups and sort -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups \ - -INPUT $prefix.fixmate.bam \ - -OUTPUT $prefix.rdgrp.bam \ - -SORT_ORDER coordinate \ - -RGID $prefix \ - -RGLB $prefix \ - -RGPL "Illumina" \ - -RGPU $prefix \ - -RGSM $prefix \ - -RGCN "RGCN" - -# remove duplicates -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates \ - -INPUT $prefix.rdgrp.bam \ - -REMOVE_DUPLICATES true \ - -METRICS_FILE $prefix.dedup.matrics \ - -OUTPUT $prefix.dedup.bam - -# index the dedup.bam file -$samtools_dir/samtools index $prefix.dedup.bam - -# GATK local realign -# find realigner targets -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ - -R refseq.fa \ - -T RealignerTargetCreator \ - -I $prefix.dedup.bam \ - -o $prefix.realn.intervals -# run realigner -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ - -R refseq.fa \ - -T IndelRealigner \ - -I $prefix.dedup.bam \ - -targetIntervals $prefix.realn.intervals \ - -o $prefix.realn.bam - -# index final bam file -$samtools_dir/samtools index $prefix.realn.bam - -# for PE sequencing -if [[ $mode == "PE" ]] -then - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ - --genome $input_assembly \ - --frags $prefix.realn.bam \ - --fix $fixlist \ - --vcf \ - --changes \ - --output $prefix.assembly.illumina_read_polished \ - >$prefix.log -else - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ - --genome $input_assembly \ - --unpaired $prefix.realn.bam \ - --fix $fixlist \ - --vcf \ - --changes \ - --output $prefix.assembly.illumina_read_polished \ - >$prefix.log -fi - -perl $LRSDAY_HOME/scripts/summarize_pilon_correction.pl -i $prefix.assembly.illumina_read_polished.changes -mv $prefix.assembly.illumina_read_polished.fasta $prefix.assembly.illumina_read_polished.fa -gzip $prefix.assembly.illumina_read_polished.vcf - +# bwa mapping +for i in $(seq 1 1 $rounds_of_successive_polishing) +do + $bwa_dir/bwa index refseq.tmp.fa + + if [[ $mode == "PE" ]] + then + $bwa_dir/bwa mem -t $threads -M refseq.tmp.fa clean.R1.fq.gz clean.R2.fq.gz >$prefix.round_${i}.sam + else + $bwa_dir/bwa mem -t $threads -M refseq.tmp.fa clean.fq.gz >$prefix.round_${i}.sam + fi + + # index reference sequence + $samtools_dir/samtools faidx refseq.tmp.fa + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary \ + -REFERENCE refseq.tmp.fa \ + -OUTPUT refseq.tmp.dict + + # sort bam file by picard-tools SortSam + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam \ + -INPUT $prefix.round_${i}.sam \ + -OUTPUT $prefix.round_${i}.sort.bam \ + -SORT_ORDER coordinate + + # fixmate + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation \ + -INPUT $prefix.round_${i}.sort.bam \ + -OUTPUT $prefix.round_${i}.fixmate.bam + + # add or replace read groups and sort + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups \ + -INPUT $prefix.round_${i}.fixmate.bam \ + -OUTPUT $prefix.round_${i}.rdgrp.bam \ + -SORT_ORDER coordinate \ + -RGID $prefix \ + -RGLB $prefix \ + -RGPL "Illumina" \ + -RGPU $prefix \ + -RGSM $prefix \ + -RGCN "RGCN" + + # remove duplicates + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates \ + -INPUT $prefix.round_${i}.rdgrp.bam \ + -REMOVE_DUPLICATES true \ + -METRICS_FILE $prefix.round_${i}.dedup.matrics \ + -OUTPUT $prefix.round_${i}.dedup.bam + + # index the dedup.bam file + $samtools_dir/samtools index $prefix.round_${i}.dedup.bam + + # GATK local realign + # find realigner targets + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.tmp.fa \ + -T RealignerTargetCreator \ + -I $prefix.round_${i}.dedup.bam \ + -o $prefix.round_${i}.realn.intervals + # run realigner + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.tmp.fa \ + -T IndelRealigner \ + -I $prefix.round_${i}.dedup.bam \ + -targetIntervals $prefix.round_${i}.realn.intervals \ + -o $prefix.round_${i}.realn.bam + + # index final bam file + $samtools_dir/samtools index $prefix.round_${i}.realn.bam + + # for PE sequencing + if [[ $mode == "PE" ]] + then + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome refseq.tmp.fa \ + --frags $prefix.round_${i}.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.pilon.round_${i} \ + >$prefix.pilon.round_${i}.log + else + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome refseq.tmp.fa \ + --unpaired $prefix.round_${i}.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.pilon.round_${i} \ + >$prefix.pilon.round_${i}.log + fi + perl $LRSDAY_HOME/scripts/summarize_pilon_correction.pl -i $prefix.assembly.pilon.round_${i}.changes + gzip $prefix.assembly.pilon.round_${i}.vcf + rm refseq.tmp.* + rm $prefix.round_${i}.sam + rm $prefix.round_${i}.sort.bam + rm $prefix.round_${i}.fixmate.bam + rm $prefix.round_${i}.rdgrp.bam + rm $prefix.round_${i}.dedup.bam + rm $prefix.round_${i}.dedup.matrics + rm $prefix.round_${i}.dedup.bam.bai + rm $prefix.round_${i}.realn.intervals + cp $prefix.assembly.pilon.round_${i}.fasta refseq.tmp.fa +done + +ln -s $prefix.assembly.pilon.round_${rounds_of_successive_polishing}.fasta $prefix.assembly.illumina_read_polished.fa +rm refseq.tmp.fa rm -r tmp # clean up intermediate files if [[ $debug == "no" ]] then rm adapter.fa - rm refseq.fa - rm refseq.fa.fai - rm refseq.dict - rm refseq.fa.bwt - rm refseq.fa.pac - rm refseq.fa.ann - rm refseq.fa.amb - rm refseq.fa.sa - rm $prefix.sam - rm $prefix.sort.bam - rm $prefix.fixmate.bam - rm $prefix.rdgrp.bam - rm $prefix.dedup.bam - rm $prefix.dedup.matrics - rm $prefix.dedup.bam.bai - rm $prefix.realn.intervals rm *.fq.gz - rm $prefix.realn.bam.bai fi diff --git a/README.md b/README.md index 1b215a4..9bf26ce 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Jia-Xing Yue & Gianni Liti. (2018) Long-read sequencing data analysis for yeasts Jia-Xing Yue, Jing Li, Louise Aigrain, Johan Hallin, Karl Persson, Karen Oliver, Anders Bergström, Paul Coupland, Jonas Warringer, Marco Cosentino Lagomarsino, Gilles Fischer, Richard Durbin, Gianni Liti. (2017) Contrasting evolutionary genome dynamics between domesticated and wild yeasts. *Nature Genetics*, 49:913-924. ## Release history +* v1.4.0 Released on 2019/03/21 * v1.3.1 Released on 2019/01/22 * v1.3.0 Released on 2018/11/13 * v1.2.0 Released on 2018/10/15 @@ -49,7 +50,7 @@ This protocol is designed for a desktop or computing server running an x86-64-bi * Tar (https://www.gnu.org/software/tar/) * Unzip (http://infozip.sourceforge.net/UnZip.html) * Virtualenv v15.1.0 or newer (https://virtualenv.pypa.io) -* Wget (https://www.gnu.org/software/wget/) +* Wget v1.14 or newer (https://www.gnu.org/software/wget/) * Zlib (https://zlib.net/) diff --git a/data/Proteome_DB_for_annotation.CDhit_I95.fa.gz b/data/Proteome_DB_for_annotation.CDhit_I95.fa.gz index fc07803..bd079d2 100644 Binary files a/data/Proteome_DB_for_annotation.CDhit_I95.fa.gz and b/data/Proteome_DB_for_annotation.CDhit_I95.fa.gz differ diff --git a/data/SGDref.PoFF.faa.gz b/data/SGDref.PoFF.faa.gz index f4aa381..9ecf6fe 100644 Binary files a/data/SGDref.PoFF.faa.gz and b/data/SGDref.PoFF.faa.gz differ diff --git a/data/SGDref.PoFF.ffn.gz b/data/SGDref.PoFF.ffn.gz index aeba03d..b93c93f 100644 Binary files a/data/SGDref.PoFF.ffn.gz and b/data/SGDref.PoFF.ffn.gz differ diff --git a/data/te_proteins.fasta.gz b/data/te_proteins.fasta.gz index ecec9eb..d62689b 100644 Binary files a/data/te_proteins.fasta.gz and b/data/te_proteins.fasta.gz differ diff --git a/install_dependencies.sh b/install_dependencies.sh index 66319f5..015c579 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -1,5 +1,5 @@ #!/bin/bash -# last update: 2019/02/28 +# last update: 2019/03/07 set -e -o pipefail @@ -11,15 +11,16 @@ PORECHOP_VERSION="0.2.4" # PORECHOP_GITHUB_COMMIT_VERSION="109e437" # committed on 2018.10.19 FILTLONG_VERSION="0.2.0" # FILTLONG_GITHUB_COMMIT_VERSION="d1bb46d" # committed on 2018.05.11 -MINIMAP2_VERSION="2.13" # released on 2018.10.11 +MINIMAP2_VERSION="2.16" # released on 2019.2.28 CANU_VERSION="1.8" # released on 2018.10.23 -FLYE_VERSION="2.4" # released on 2018.01.15 +FLYE_VERSION="2.4.1" # released on 2019.03.07 WTDBG2_VERSION="2.3" # -WTDBG2_GITHUB_COMMIT_VERSION="4a39621" # committed on 2019.01.15 +WTDBG2_GITHUB_COMMIT_VERSION="59a39a6" # committed on 2019.03.06 SMARTDENOVO_VERSION="" # SMARTDENOVO_GITHUB_COMMIT_VERSION="5cc1356" # committed on 2018.02.19 RAGOUT_VERSION="2.1.1" # released on 2018.07.30 -#QUAST_VERSION="5.0.1" # one of its dependency needs "csh" to be pre-installed +# GUPPY_VERSION="2.3.5" # released on 2019.02.26 +# QUAST_VERSION="5.0.1" # one of its dependency needs "csh" to be pre-installed HDF_VERSION="1.10.1" # SONLIB_VERSION="" # SONLIB_GITHUB_COMMIT_VERSION="1afbd97" # committed on 2017.08.09 @@ -34,10 +35,10 @@ CAP_VERSION="" # see http://seq.cs.iastate.edu/cap3.html BWA_VERSION="0.7.17" # released on 2017.10.23 SAMTOOLS_VERSION="1.9" # released on 2018.07.18 CIRCLATOR_VERSION="1.5.5" # released on 2018.01.31 -TRIMMOMATIC_VERSION="0.36" # +TRIMMOMATIC_VERSION="0.38" # GATK_VERSION="3.6-6" # -PICARD_VERSION="2.18.15" # released on 2018.10.22 -PILON_VERSION="1.22" # released on 2017.03.15 +PICARD_VERSION="2.18.23" # released on 2019.02.25 +PILON_VERSION="1.23" # released on 2018.11.27 EXONERATE_VERSION="2.2.0" # BLAST_VERSION="2.2.31" # RMBLAST_VERSION="2.2.28" # @@ -62,7 +63,7 @@ MINICONDA2_VERSION="4.5.11" # PB_ASSEMBLY_VERSION="0.0.2" # BAX2BAM_VERSION="0.0.9" # NANOPOLISH_VERSION="0.11.0" # -NANOPOLISH_GITHUB_COMMIT_VERSION="3180474" # commited on 2019.01.18 +NANOPOLISH_GITHUB_COMMIT_VERSION="8b5e605" # commited on 2019.02.22 PARALLEL_VERSION="20180722" # released on 2018.07.22 # for MFannot EMBOSS_VERSION="6.5.7" # released on 2012.07.25 @@ -86,11 +87,11 @@ PORECHOP_DOWNLOAD_URL="https://github.com/rrwick/Porechop.git" FILTLONG_DOWNLOAD_URL="https://github.com/rrwick/Filtlong.git" MINIMAP2_DOWNLOAD_URL="https://github.com/lh3/minimap2/releases/download/v${MINIMAP2_VERSION}/minimap2-${MINIMAP2_VERSION}_x64-linux.tar.bz2" CANU_DOWNLOAD_URL="https://github.com/marbl/canu/releases/download/v${CANU_VERSION}/canu-${CANU_VERSION}.Linux-amd64.tar.xz" -#CANU_DOWNLOAD_URL="https://github.com/marbl/canu/archive/v${CANU_VERSION}.tar.gz" FLYE_DOWNLOAD_URL="https://github.com/fenderglass/Flye/archive/${FLYE_VERSION}.tar.gz" WTDBG2_DOWNLOAD_URL="https://github.com/ruanjue/wtdbg2.git" SMARTDENOVO_DOWNLOAD_URL="https://github.com/ruanjue/smartdenovo" -#QUAST_DOWNLOAD_URL="https://downloads.sourceforge.net/project/quast/quast-${QUAST_VERSION}.tar.gz" +# GUPPY_DOWNLOAD_URL="https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_${GUPPY_VERSION}_linux64.tar.gz" +# QUAST_DOWNLOAD_URL="https://downloads.sourceforge.net/project/quast/quast-${QUAST_VERSION}.tar.gz" RAGOUT_DOWNLOAD_URL="https://github.com/fenderglass/Ragout/archive/${RAGOUT_VERSION}.tar.gz" HDF_VERSION_prefix=${HDF_VERSION%.*} HDF_DOWNLOAD_URL="https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-${HDF_VERSION_prefix}/hdf5-${HDF_VERSION}/src/hdf5-${HDF_VERSION}.tar.gz" @@ -281,20 +282,6 @@ ln -s $minimap2_dir/minimap2 . cd $build_dir rm canu-${CANU_VERSION}.tar.xz -# cd $build_dir -# echo "Download Canu-v${CANU_VERSION}" -# download $CANU_DOWNLOAD_URL "canu-${CANU_VERSION}.tar.gz" -# tar -xzf canu-${CANU_VERSION}.tar.gz -# canu_dir="$build_dir/canu-${CANU_VERSION}" -# cd $canu_dir -# cd src -# make -j 8 -# canu_dir="$build_dir/canu-${CANU_VERSION}/Linux-amd64/bin" -# cd $canu_dir -# ln -s $minimap2_dir/minimap2 . -# cd $build_dir -# rm canu-${CANU_VERSION}.tar.gz - # ------------- Flye ------------------- cd $build_dir echo "Download Flye-v${FLYE_VERSION}" @@ -401,6 +388,14 @@ cd $build_dir rm gnuplot-${GNUPLOT_VERSION}.tar.gz PATH="$gnuplot_dir:${PATH}" +# # --------------- Guppy -------------------- +# cd $build_dir +# echo "Download Guppy-v${GUPPY_VERSION}" +# download $GUPPY_DOWNLOAD_URL "ont-guppy-cpu_${GUPPY_VERSION}_linux64.tar.gz" +# tar -xzf ont-guppy-cpu_${GUPPY_VERSION}_linux64.tar.gz +# guppy_dir="$build_dir/ont-guppy-cpu/bin" +# rm "ont-guppy-cpu_${GUPPY_VERSION}_linux64.tar.gz" + # # ------------- QUAST -------------------- # cd $build_dir # echo "Download QUAST-v${QUAST_VERSION}" @@ -792,6 +787,8 @@ $miniconda2_dir/conda config --add channels bioconda $miniconda2_dir/conda config --add channels conda-forge $miniconda2_dir/conda create -y -p $build_dir/conda_pacbio_env source $miniconda2_dir/activate $build_dir/conda_pacbio_env +$miniconda2_dir/conda install -y hdf5=${HDF_VERSION} +$miniconda2_dir/conda install -y -c bioconda samtools=${SAMTOOLS_VERSION} openssl=1.0 $miniconda2_dir/conda install -y -c bioconda pb-assembly=${PB_ASSEMBLY_VERSION} $miniconda2_dir/conda install -y -c bioconda bax2bam=${BAX2BAM_VERSION} source $miniconda2_dir/deactivate @@ -868,7 +865,7 @@ download $PIROBJECT_DOWNLOAD_URL "pirobject_v${PIROBJECT_VERSION}.tar.gz" tar -zxf pirobject_v${PIROBJECT_VERSION}.tar.gz cd PirObject-${PIROBJECT_VERSION} pirobject_dir="$build_dir/PirObject-${PIROBJECT_VERSION}" -ln -s ./../lib/PirObject.pm . +ln -s ./lib/PirObject.pm . cd $build_dir rm pirobject_v${PIROBJECT_VERSION}.tar.gz @@ -880,7 +877,7 @@ cd PirModels git checkout -f -q $PIRMODELS_GITHUB_COMMIT_VERSION cd .. cp -r PirModels $pirobject_dir -pirmodels_dir="$perlobject_dir/PirModels" +pirmodels_dir="$pirobject_dir/PirModels" # --------------- Flip ------------------ cd $build_dir @@ -966,7 +963,8 @@ echo "export canu_dir=${canu_dir}" >> env.sh echo "export flye_dir=${flye_dir}" >> env.sh echo "export wtdbg2_dir=${wtdbg2_dir}" >> env.sh echo "export smartdenovo_dir=${smartdenovo_dir}" >> env.sh -#echo "export quast_dir=${quast_dir}" >> env.sh +# echo "export guppy_dir=${guppy_dir}" >> env.sh +# echo "export quast_dir=${quast_dir}" >> env.sh echo "export ragout_dir=${ragout_dir}" >> env.sh echo "export hdf_dir=${hdf_dir}" >> env.sh echo "export h5prefix=${h5prefix}" >> env.sh diff --git a/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh b/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh index 82d5682..2fdbe69 100755 --- a/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh +++ b/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh @@ -12,27 +12,31 @@ reads="./../00.Long_Reads/YGL3210.fq.gz" # The file path of the long reads file reads_type="nanopore-raw" # The long reads data type: "pacbio-raw" or "pacbio-corrected" or "nanopore-raw" or "nanopore-corrected". run_filtering="yes" # Whether to filter the reads: "yes" or "no". Default = "yes". genome_size="12500000" # The haploid genome size (in bp) of sequenced organism. Default = "12500000" (i.e. 12.5 Mb for the budding yeast S. cereviaie genome). This is used to calculate targeted sequencing coverage after read filtering (see below). -post_filtering_coverage="30" # Targeted sequencing coverage after read filtering. Default = "30" (i.e. 30x coverage). +post_filtering_coverage="40" # Targeted sequencing coverage after read filtering. Default = "40" (i.e. 40x coverage). threads=1 # The number of threads to use. Default = "1". ####################################### # process the pipeline -filtlong_target_bases=$(($genome_size * $post_filtering_coverage)) -echo "" -echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage, filtlong_target_bases=$filtlong_target_bases" -echo "" if [[ "$reads_type" == "nanopore-raw" || "$reads_type" == "nanopore-corrected" ]] then $porechop_dir/porechop -i $reads -o $prefix.porechop.fastq.gz --discard_middle --threads $threads > $prefix.porechop.summary.txt if [[ "$run_filtering" == "yes" ]] then - $filtlong_dir/filtlong --min_length 1000 --keep_percent 90 --target_bases $filtlong_target_bases $prefix.porechop.fastq.gz | gzip > $prefix.filtlong.fastq.gz + filtlong_target_bases=$(($genome_size * $post_filtering_coverage)) + echo "" + echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage, filtlong_target_bases=$filtlong_target_bases" + echo "" + $filtlong_dir/filtlong --min_length 1000 --mean_q_weight 10 --target_bases $filtlong_target_bases $prefix.porechop.fastq.gz | gzip > $prefix.filtlong.fastq.gz fi else if [[ "$run_filtering" == "yes" ]] then - $filtlong_dir/filtlong --min_length 1000 --keep_percent 90 --target_bases $filtlong_target_bases $reads | gzip > $prefix.filtlong.fastq.gz + filtlong_target_bases=$(($genome_size * $post_filtering_coverage)) + echo "" + echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage, filtlong_target_bases=$filtlong_target_bases" + echo "" + $filtlong_dir/filtlong --min_length 1000 --mean_q_weight 10 --target_bases $filtlong_target_bases $reads | gzip > $prefix.filtlong.fastq.gz fi fi diff --git a/pipelines/LRSDAY.00.PacBio.RSII_bax2bam.sh b/pipelines/LRSDAY.00.PacBio.RSII_bax2bam.sh index 48a1ee8..3e7671e 100755 --- a/pipelines/LRSDAY.00.PacBio.RSII_bax2bam.sh +++ b/pipelines/LRSDAY.00.PacBio.RSII_bax2bam.sh @@ -8,7 +8,7 @@ source ./../../env.sh # set project-specific variables prefix="SK1.SMRTCell.1" # The file name prefix for the output files. For the testing example, run this script four times with the prefix of "SK1.SMRTCell.1", "SK1.SMRTCell.2", "SK1.SMRTCell.3", and "SK1.SMRTCell.4" respectively. -pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/SK1.SMRTCell.1.RSII_bax.fofn" # The fofn file containing file paths to the PacBio RSII bax reads from the same SMRT cell. If you have data from multiple SMRT cells, please run this script sepearately for each of them. Do not mix reads from different SMRT cells even though they come from the same sample. For the testing example, you can set pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/$prefix.RSII_bax.fofn" to let this parameter to be automatically set up based on the prefix parameter. +pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/$prefix.RSII_bax.fofn" # The fofn file containing file paths to the PacBio RSII bax reads from the same SMRT cell. If you have data from multiple SMRT cells, please run this script sepearately for each of them. Do not mix reads from different SMRT cells even though they come from the same sample. For the testing example, you can set pacbio_RSII_bax_fofn_file="./pacbio_fofn_files/$prefix.RSII_bax.fofn" to let this parameter to be automatically set up based on the prefix parameter. ####################################### # process the pipeline diff --git a/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh b/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh index b2f9564..bae15a7 100755 --- a/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh +++ b/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh @@ -14,7 +14,7 @@ prefix="SK1" # The file name prefix for output files of the testing example. # process the pipeline echo "download the bam file from the ENA database ..." -wget $file_url +wget --no-check-certificate $file_url echo "bam2fastq ..." $bedtools_dir/bedtools bamtofastq -i $file_name -fq $prefix.filtered_subreads.fastq echo "gzip fastq ..." @@ -24,12 +24,12 @@ rm $file_name cd pacbio_fofn_files echo "download the metadata and raw PacBio reads in .h5 format ..." -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.metadata.xml +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml if [[ ! -d Analysis_Results ]] then @@ -38,35 +38,35 @@ fi cd Analysis_Results -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 - -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 - -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 -# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 + +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 + +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.bas.h5 + +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 + +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 +# wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.bas.h5 + +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 +wget --no-check-certificate ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 ### diff --git a/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh b/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh index fbcb282..d03f79c 100755 --- a/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh +++ b/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh @@ -23,13 +23,15 @@ prefix="SK1" # The file name prefix for the output files. Default = "SK1" for th threads=1 # The number of threads to use. Default = "1". ploidy=1 # The ploidy status of the sequenced genome. use "1" for haploid genome and "2" for diploid genome. Default = "1" for the testing example. +rounds_of_successive_polishing=1 # The number of total rounds of long-read-based assembly polishing. Default = "1" for the testing example. debug="no" # Use "yes" if prefer to keep intermediate files, otherwise use "no". Default = "no" ########################################### # process the pipeline ln -s $input_assembly $prefix.assembly.raw.fa -$samtools_dir/samtools faidx $prefix.assembly.raw.fa +cp $prefix.assembly.raw.fa $prefix.assembly.tmp.fa + mkdir tmp if [[ $long_read_technology == "pacbio" ]] @@ -38,25 +40,42 @@ then source $miniconda2_dir/activate $conda_pacbio_dir/../../conda_pacbio_env if [[ $pacbio_reads_type == "RSII" ]] then - $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam - if [[ $ploidy == "1" ]] - then - $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf - else - $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf --diploid - fi + for i in $(seq 1 1 $rounds_of_successive_polishing) + do + $samtools_dir/samtools faidx $prefix.assembly.tmp.fa + $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.tmp.fa $prefix.pbalign.round_${i}.bam + if [[ $ploidy == "1" ]] + then + $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf + else + $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf --diploid + fi + rm $prefix.assembly.tmp.fa + rm $prefix.assembly.tmp.fa.fai + cp $prefix.assembly.consensus.round_${i}.fa $prefix.assembly.tmp.fa + rm $prefix.assembly.consensus.round_${i}.fq + rm $prefix.assembly.consensus.round_${i}.vcf + done else - $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam - if [[ $ploidy == "1" ]] - then - $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf - else - $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf --diploid - fi + for i in $(seq 1 1 $rounds_of_successive_polishing) + do + $samtools_dir/samtools faidx $prefix.assembly.tmp.fa + $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.tmp.fa $prefix.pbalign.round_${i}.bam + if [[ $ploidy == "1" ]] + then + $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf + else + $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.round_${i}.bam -r $prefix.assembly.tmp.fa -o $prefix.assembly.consensus.round_${i}.fa -o $prefix.assembly.consensus.round_${i}.fq -o $prefix.assembly.consensus.round_${i}.vcf --diploid + fi + rm $prefix.assembly.tmp.fa + rm $prefix.assembly.tmp.fa.fai + cp $prefix.assembly.consensus.round_${i}.fa $prefix.assembly.tmp.fa + gzip $prefix.assembly.consensus.round_${i}.fq + gzip $prefix.assembly.consensus.round_${i}.vcf + done fi - gzip $prefix.assembly.consensus.fq - gzip $prefix.assembly.consensus.vcf - ln -s $prefix.assembly.consensus.fa $prefix.assembly.long_read_polished.fa + ln -s $prefix.assembly.consensus.round_${rounds_of_successive_polishing}.fa $prefix.assembly.long_read_polished.fa + rm $prefix.assembly.tmp.fa source $miniconda2_dir/deactivate else # perform correction using the minimap2-nanopolish pipeline @@ -67,16 +86,23 @@ else else $nanopolish_dir/nanopolish index -d $nanopore_fast5_files -s $nanopore_basecalling_sequencing_summary $long_reads_in_fastq fi - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary REFERENCE=$prefix.assembly.raw.fa OUTPUT=$prefix.assembly.raw.dict - $minimap2_dir/minimap2 -ax map-ont $prefix.assembly.raw.fa $long_reads_in_fastq > $prefix.minimap2.sam - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam INPUT=$prefix.minimap2.sam OUTPUT=$prefix.minimap2.bam SORT_ORDER=coordinate - $samtools_dir/samtools index $prefix.minimap2.bam - rm $prefix.minimap2.sam - python3 $nanopolish_dir/scripts/nanopolish_makerange.py $prefix.assembly.raw.fa | $parallel_dir/parallel --results ${prefix}_nanopolish_results -P 1 \ - $nanopolish_dir/nanopolish variants --consensus -o $prefix.polished.{1}.vcf -w {1} --ploidy $ploidy -r $long_reads_in_fastq -b $prefix.minimap2.bam -g $prefix.assembly.raw.fa -t $threads --min-candidate-frequency 0.2 || true - $nanopolish_dir/nanopolish vcf2fasta -g $prefix.assembly.raw.fa $prefix.polished.*.vcf > $prefix.assembly.nanopolish.fa - ln -s $prefix.assembly.nanopolish.fa $prefix.assembly.long_read_polished.fa - mv $prefix.polished.*.vcf ${prefix}_nanopolish_results + for i in $(seq 1 1 $rounds_of_successive_polishing) + do + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary -REFERENCE $prefix.assembly.tmp.fa -OUTPUT $prefix.assembly.tmp.dict + $minimap2_dir/minimap2 -ax map-ont $prefix.assembly.tmp.fa $long_reads_in_fastq > $prefix.minimap2.round_${i}.sam + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam -INPUT $prefix.minimap2.round_${i}.sam -OUTPUT $prefix.minimap2.round_${i}.bam -SORT_ORDER coordinate + $samtools_dir/samtools index $prefix.minimap2.round_${i}.bam + rm $prefix.minimap2.round_${i}.sam + python3 $nanopolish_dir/scripts/nanopolish_makerange.py $prefix.assembly.tmp.fa | $parallel_dir/parallel --results ${prefix}_nanopolish_round_${i}_results -P 1 \ + $nanopolish_dir/nanopolish variants --consensus -o $prefix.polished.{1}.vcf -w {1} --ploidy $ploidy -r $long_reads_in_fastq -b $prefix.minimap2.round_${i}.bam -g $prefix.assembly.tmp.fa -t $threads --min-candidate-frequency 0.2 || true + $nanopolish_dir/nanopolish vcf2fasta -g $prefix.assembly.tmp.fa $prefix.polished.*.vcf > $prefix.assembly.nanopolish.round_${i}.fa + rm $prefix.assembly.tmp.fa + rm $prefix.assembly.tmp.dict + cp $prefix.assembly.nanopolish.round_${i}.fa $prefix.assembly.tmp.fa + mv $prefix.polished.*.vcf ${prefix}_nanopolish_round_${i}_results + done + ln -s $prefix.assembly.nanopolish.round_${rounds_of_successive_polishing}.fa $prefix.assembly.long_read_polished.fa + rm $prefix.assembly.tmp.fa fi rm -r tmp diff --git a/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh b/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh index e02c052..a14b0b8 100755 --- a/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh +++ b/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh @@ -9,6 +9,8 @@ source ./../../env.sh input_assembly="./../02.Long-read-based_Assembly_Polishing/SK1.assembly.long_read_polished.fa" # The file path of the input assembly before Illumina-based correction prefix="SK1" # The file name prefix for the output files. +trim_illumina_reads="yes" # Whether to trim the input Illumina reads. Use "yes" if prefer to perform trimming, otherwise use "no". Default = "yes". +rounds_of_successive_polishing=1 # The number of total rounds of Illumina-read-based assembly polishing. Default = "1" for the testing example. threads=1 # The number of threads to use. Default = "1". mode="PE" # Illumina sequencing mode, "PE" for paired-end sequencing and "SE" for single-end sequencing. Default = "PE". fixlist="snps,indels" # The types of errors for Illumina-read-based correction by Pilon; see Pilon's manual for more details. Default = "snps,indels". @@ -34,137 +36,147 @@ else ln -s $reads_SE raw.fq.gz; fi -ln -s $input_assembly refseq.fa - +cp $input_assembly refseq.tmp.fa cp $adapter adapter.fa mkdir tmp -if [[ $mode == "PE" ]] -then - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar PE -threads $threads -phred33 raw.R1.fq.gz raw.R2.fq.gz trimmed.R1.fq.gz trimmed.unpaired.R1.fq.gz trimmed.R2.fq.gz trimmed.unpaired.R2.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 - rm trimmed.unpaired.R1.fq.gz - rm trimmed.unpaired.R2.fq.gz -else - java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar SE -threads $threads -phred33 raw.fq.gz trimmed.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 -fi - -# bwa mapping -$bwa_dir/bwa index refseq.fa -if [[ $mode == "PE" ]] +if [[ $trim_illumina_reads == "yes" ]] then - $bwa_dir/bwa mem -t $threads -M refseq.fa trimmed.R1.fq.gz trimmed.R2.fq.gz >$prefix.sam + if [[ $mode == "PE" ]] + then + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar PE -threads $threads -phred33 raw.R1.fq.gz raw.R2.fq.gz trimmed.R1.fq.gz trimmed.unpaired.R1.fq.gz trimmed.R2.fq.gz trimmed.unpaired.R2.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 + rm trimmed.unpaired.R1.fq.gz + rm trimmed.unpaired.R2.fq.gz + mv trimmed.R1.fq.gz clean.R1.fq.gz + mv trimmed.R2.fq.gz clean.R2.fq.gz + else + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $trimmomatic_dir/trimmomatic.jar SE -threads $threads -phred33 raw.fq.gz trimmed.fq.gz ILLUMINACLIP:adapter.fa:2:30:10 SLIDINGWINDOW:5:20 MINLEN:36 + mv trimmed.fq.gz clean.fq.gz + fi else - $bwa_dir/bwa mem -t $threads -M refseq.fa trimmed.fq.gz >$prefix.sam + if [[ $mode == "PE" ]] + then + cp raw.R1.fq.gz clean.R1.fq.gz + cp raw.R2.fq.gz clean.R2.fq.gz + else + cp raw.fq.gz clean.fq.gz + fi fi -# index reference sequence -$samtools_dir/samtools faidx refseq.fa -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary \ - -REFERENCE refseq.fa \ - -OUTPUT refseq.dict - -# sort bam file by picard-tools SortSam -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam \ - -INPUT $prefix.sam \ - -OUTPUT $prefix.sort.bam \ - -SORT_ORDER coordinate - -# fixmate -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation \ - -INPUT $prefix.sort.bam \ - -OUTPUT $prefix.fixmate.bam - -# add or replace read groups and sort -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups \ - -INPUT $prefix.fixmate.bam \ - -OUTPUT $prefix.rdgrp.bam \ - -SORT_ORDER coordinate \ - -RGID $prefix \ - -RGLB $prefix \ - -RGPL "Illumina" \ - -RGPU $prefix \ - -RGSM $prefix \ - -RGCN "RGCN" - -# remove duplicates -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates \ - -INPUT $prefix.rdgrp.bam \ - -REMOVE_DUPLICATES true \ - -METRICS_FILE $prefix.dedup.matrics \ - -OUTPUT $prefix.dedup.bam - -# index the dedup.bam file -$samtools_dir/samtools index $prefix.dedup.bam - -# GATK local realign -# find realigner targets -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ - -R refseq.fa \ - -T RealignerTargetCreator \ - -I $prefix.dedup.bam \ - -o $prefix.realn.intervals -# run realigner -java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ - -R refseq.fa \ - -T IndelRealigner \ - -I $prefix.dedup.bam \ - -targetIntervals $prefix.realn.intervals \ - -o $prefix.realn.bam - -# index final bam file -$samtools_dir/samtools index $prefix.realn.bam - -# for PE sequencing -if [[ $mode == "PE" ]] -then - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ - --genome $input_assembly \ - --frags $prefix.realn.bam \ - --fix $fixlist \ - --vcf \ - --changes \ - --output $prefix.assembly.illumina_read_polished \ - >$prefix.log -else - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ - --genome $input_assembly \ - --unpaired $prefix.realn.bam \ - --fix $fixlist \ - --vcf \ - --changes \ - --output $prefix.assembly.illumina_read_polished \ - >$prefix.log -fi - -perl $LRSDAY_HOME/scripts/summarize_pilon_correction.pl -i $prefix.assembly.illumina_read_polished.changes -mv $prefix.assembly.illumina_read_polished.fasta $prefix.assembly.illumina_read_polished.fa -gzip $prefix.assembly.illumina_read_polished.vcf - +# bwa mapping +for i in $(seq 1 1 $rounds_of_successive_polishing) +do + $bwa_dir/bwa index refseq.tmp.fa + + if [[ $mode == "PE" ]] + then + $bwa_dir/bwa mem -t $threads -M refseq.tmp.fa clean.R1.fq.gz clean.R2.fq.gz >$prefix.round_${i}.sam + else + $bwa_dir/bwa mem -t $threads -M refseq.tmp.fa clean.fq.gz >$prefix.round_${i}.sam + fi + + # index reference sequence + $samtools_dir/samtools faidx refseq.tmp.fa + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary \ + -REFERENCE refseq.tmp.fa \ + -OUTPUT refseq.tmp.dict + + # sort bam file by picard-tools SortSam + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam \ + -INPUT $prefix.round_${i}.sam \ + -OUTPUT $prefix.round_${i}.sort.bam \ + -SORT_ORDER coordinate + + # fixmate + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation \ + -INPUT $prefix.round_${i}.sort.bam \ + -OUTPUT $prefix.round_${i}.fixmate.bam + + # add or replace read groups and sort + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups \ + -INPUT $prefix.round_${i}.fixmate.bam \ + -OUTPUT $prefix.round_${i}.rdgrp.bam \ + -SORT_ORDER coordinate \ + -RGID $prefix \ + -RGLB $prefix \ + -RGPL "Illumina" \ + -RGPU $prefix \ + -RGSM $prefix \ + -RGCN "RGCN" + + # remove duplicates + java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates \ + -INPUT $prefix.round_${i}.rdgrp.bam \ + -REMOVE_DUPLICATES true \ + -METRICS_FILE $prefix.round_${i}.dedup.matrics \ + -OUTPUT $prefix.round_${i}.dedup.bam + + # index the dedup.bam file + $samtools_dir/samtools index $prefix.round_${i}.dedup.bam + + # GATK local realign + # find realigner targets + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.tmp.fa \ + -T RealignerTargetCreator \ + -I $prefix.round_${i}.dedup.bam \ + -o $prefix.round_${i}.realn.intervals + # run realigner + java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.tmp.fa \ + -T IndelRealigner \ + -I $prefix.round_${i}.dedup.bam \ + -targetIntervals $prefix.round_${i}.realn.intervals \ + -o $prefix.round_${i}.realn.bam + + # index final bam file + $samtools_dir/samtools index $prefix.round_${i}.realn.bam + + # for PE sequencing + if [[ $mode == "PE" ]] + then + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome refseq.tmp.fa \ + --frags $prefix.round_${i}.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.pilon.round_${i} \ + >$prefix.pilon.round_${i}.log + else + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome refseq.tmp.fa \ + --unpaired $prefix.round_${i}.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.pilon.round_${i} \ + >$prefix.pilon.round_${i}.log + fi + perl $LRSDAY_HOME/scripts/summarize_pilon_correction.pl -i $prefix.assembly.pilon.round_${i}.changes + gzip $prefix.assembly.pilon.round_${i}.vcf + rm refseq.tmp.* + rm $prefix.round_${i}.sam + rm $prefix.round_${i}.sort.bam + rm $prefix.round_${i}.fixmate.bam + rm $prefix.round_${i}.rdgrp.bam + rm $prefix.round_${i}.dedup.bam + rm $prefix.round_${i}.dedup.matrics + rm $prefix.round_${i}.dedup.bam.bai + rm $prefix.round_${i}.realn.intervals + cp $prefix.assembly.pilon.round_${i}.fasta refseq.tmp.fa +done + +ln -s $prefix.assembly.pilon.round_${rounds_of_successive_polishing}.fasta $prefix.assembly.illumina_read_polished.fa +rm refseq.tmp.fa rm -r tmp # clean up intermediate files if [[ $debug == "no" ]] then rm adapter.fa - rm refseq.fa - rm refseq.fa.fai - rm refseq.dict - rm refseq.fa.bwt - rm refseq.fa.pac - rm refseq.fa.ann - rm refseq.fa.amb - rm refseq.fa.sa - rm $prefix.sam - rm $prefix.sort.bam - rm $prefix.fixmate.bam - rm $prefix.rdgrp.bam - rm $prefix.dedup.bam - rm $prefix.dedup.matrics - rm $prefix.dedup.bam.bai - rm $prefix.realn.intervals rm *.fq.gz - rm $prefix.realn.bam.bai fi diff --git a/scripts/run_guppy_basecalling.sh b/scripts/run_guppy_basecalling.sh new file mode 100755 index 0000000..6b94087 --- /dev/null +++ b/scripts/run_guppy_basecalling.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# last update: 2019.03.07 + +source ./../env.sh +guppy_dir="$LRSDAY_HOME/build/ont-guppy-cpu/bin" + +flowcell_version="FLO-MIN106" # Please check your own flowcell version +sequencing_kit_version="SQK-LSK108" # Please check your own flowcell_kit_version +barcode_kit_version="EXP-NBD104" # Please check your own barcode_kit_version + +project_name="Project_YGL3210" # LRSDAY Project name +raw_reads_directory="$LRSDAY_HOME/$project_name/00.Long_Reads/RAW" # The directory containing the raw nanopore reads before basecalling +basecalling_output_directory="$LRSDAY_HOME/$project_name/00.Long_Reads/Guppy_Basecalling_out" +demultiplexing_output_directory="$LRSDAY_HOME/$project_name/00.Long_Reads/Guppy_Demultiplexing_out" + +qual=5 # read quality filter for guppy basecalling +num_callers=1 # num_callers for guppy +threads_per_caller=1 # threads_per_caller for guppy +demultiplexing_threads=1 # threads to use for demultiplexing + +$guppy_dir/guppy_basecaller \ + --flowcell $flowcell_version \ + --kit $sequencing_kit_version \ + --recursive \ + --input_path $raw_reads_directory \ + --save_path $basecalling_output_directory \ + --fast5_out \ + --qscore_filtering \ + --min_qscore $qual \ + --num_callers $num_callers \ + --cpu_threads_per_caller $threads_per_caller + + +cd $basecalling_output_directory +cat ./pass/*.fastq |gzip -c > $project_name.basecalled_reads.Q${qual}.pass.fastq.gz +cat ./fail/*.fastq |gzip -c > $project_name.basecalled_reads.Q${qual}.fail.fastq.gz + +$guppy_dir/guppy_barcoder \ + --barcode_kit $barcode_kit_version \ + --recursive \ + --input_path $basecalling_output_directory/pass \ + --save_path $demultiplexing_output_directory \ + --worker_threads $demultiplexing_threads + + +cd $demultiplexing_output_directory +for b in barcode* +do + echo "barcode=$b" + cat ./$b/*.fastq |gzip -c $project_name.basecalled_reads.Q${qual}.pass.$b.fastq.gz +done + +cat ./unclassified/*.fastq |gzip -c $project_name.basecalled_reads.Q${qual}.pass.$b.fastq.gz + + + + +