diff --git a/CHANGELOG.md b/CHANGELOG.md index 9451ef6..dfc2a0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +## [1.3.1] - 2019-01-22 +### Added +- A script for generating demultiplexed fastq reads based on nanopore's guppy demultiplexing summary file. +- A script for extracting all read IDs from the input fastq file. +- A script for calculating the sequence length of each enclosed sequence from the input fasta file. +### Changed +- Software version updates for a number of dependencies. Importantly, nanopolish was bumped to v0.11.0 to support the new multi-fast5 format of nanopore reads. +- Download URL updates for the testing data due to changes on the EBI data server. +- Adoption of Picard tools' new command-line syntax. +### Fixed +- A bug that prevented correct parameter parsing when multiple customized canu parameters were specified. +- Minor bugs related to deleting intermediate files in special cases. + ## [1.3.0] - 2018-11-13 ### Added - Support for one more alternative assembler: wtdbg2. diff --git a/Manual.pdf b/Manual.pdf index 33fed57..e8306c3 100644 Binary files a/Manual.pdf and b/Manual.pdf differ diff --git a/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh b/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh index 51464bf..82d5682 100755 --- a/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh +++ b/Project_Template/00.Long_Reads/LRSDAY.00.Long_Reads_Preprocessing.sh @@ -24,7 +24,7 @@ echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage echo "" if [[ "$reads_type" == "nanopore-raw" || "$reads_type" == "nanopore-corrected" ]] then - $porechop_dir/porechop -i $reads -o $prefix.porechop.fastq.gz --threads $threads > $prefix.porechop.summary.txt + $porechop_dir/porechop -i $reads -o $prefix.porechop.fastq.gz --discard_middle --threads $threads > $prefix.porechop.summary.txt if [[ "$run_filtering" == "yes" ]] then $filtlong_dir/filtlong --min_length 1000 --keep_percent 90 --target_bases $filtlong_target_bases $prefix.porechop.fastq.gz | gzip > $prefix.filtlong.fastq.gz diff --git a/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh b/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh index 3ceac14..b2f9564 100755 --- a/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh +++ b/Project_Template/00.Long_Reads/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh @@ -24,10 +24,12 @@ rm $file_name cd pacbio_fofn_files echo "download the metadata and raw PacBio reads in .h5 format ..."
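The three helper scripts listed under "Added" in the CHANGELOG entry above ship with LRSDAY itself. Purely for orientation, and not the actual LRSDAY code, stand-alone sketches of the last two operations could look like this (file names are hypothetical):

  # sketch: extract every read ID from a fastq file (assumes a plain 4-line-per-record fastq)
  zcat reads.fastq.gz | awk 'NR % 4 == 1 {sub(/^@/, ""); print $1}' > read_ids.txt

  # sketch: report the length of each sequence enclosed in a fasta file
  awk '/^>/ {if (name != "") print name "\t" len; name = substr($1, 2); len = 0; next} {len += length($0)} END {if (name != "") print name "\t" len}' assembly.fa > seq_length.txt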
-wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml if [[ ! -d Analysis_Results ]] then @@ -35,39 +37,64 @@ then fi cd Analysis_Results -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 -wget 
ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 + +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 + +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 + +### echo $(pwd)/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.1.RSII_bax.fofn echo $(pwd)/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.1.RSII_bax.fofn echo $(pwd)/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 
>> ./../$prefix.SMRTCell.1.RSII_bax.fofn + echo $(pwd)/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.2.RSII_bax.fofn echo $(pwd)/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.2.RSII_bax.fofn echo $(pwd)/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.2.RSII_bax.fofn + echo $(pwd)/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.3.RSII_bax.fofn echo $(pwd)/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.3.RSII_bax.fofn echo $(pwd)/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.3.RSII_bax.fofn + echo $(pwd)/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.4.RSII_bax.fofn echo $(pwd)/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.4.RSII_bax.fofn echo $(pwd)/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.4.RSII_bax.fofn +# echo $(pwd)/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.5.RSII_bax.fofn +# echo $(pwd)/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.5.RSII_bax.fofn +# echo $(pwd)/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.5.RSII_bax.fofn + +# echo $(pwd)/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.6.RSII_bax.fofn +# echo $(pwd)/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.6.RSII_bax.fofn +# echo $(pwd)/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.6.RSII_bax.fofn + + cd .. cd .. diff --git a/Project_Template/00.Long_Reads/nanopore_fast5_files/.gitkeep b/Project_Template/00.Long_Reads/nanopore_fast5_files/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/Project_Template/00.Long_Reads/pacbio_fofn_files/.gitkeep b/Project_Template/00.Long_Reads/pacbio_fofn_files/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/Project_Template/00.Long_Reads/pacbio_fofn_files/Analysis_Results/.gitkeep b/Project_Template/00.Long_Reads/pacbio_fofn_files/Analysis_Results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/Project_Template/01.Long-read-based_Genome_Assembly/LRSDAY.01.Long-read-based_Genome_Assembly.sh b/Project_Template/01.Long-read-based_Genome_Assembly/LRSDAY.01.Long-read-based_Genome_Assembly.sh index 222af03..bda2793 100755 --- a/Project_Template/01.Long-read-based_Genome_Assembly/LRSDAY.01.Long-read-based_Genome_Assembly.sh +++ b/Project_Template/01.Long-read-based_Genome_Assembly/LRSDAY.01.Long-read-based_Genome_Assembly.sh @@ -12,8 +12,8 @@ long_reads="./../00.Long_Reads/SK1.filtered_subreads.fastq.gz" # The file path o long_reads_type="pacbio-raw" # The long reads data type. Use "pacbio-raw" or "pacbio-corrected" or "nanopore-raw" or "nanopore-corrected". Default = "pacbio-raw" for the testing example genome_size="12.5m" # The estimated genome size with the format of <number>[g|m|k], e.g. 12.5m for 12.5 Mb. Default = "12.5m". assembler="canu" # The long-read assembler: Use "canu" or "flye" or "wtdbg2" or "smartdenovo" or "canu-flye" or "canu-wtdbg2" or "canu-smartdenovo". 
For "canu-flye", "canu-wtdbg2", and "canu-smartdenovo", the assembler canu is used first to generate error-corrected reads from the raw reads and then the assembler flye/wtdbg2/smartdenovo is used to assemble the genome. Based on our test, assembler="canu" generally gives the best result but will take substantially longer time than the other options. -customized_canu_parameters="-correctedErrorRate=0.04" # For assembler="canu" only. Users can set customized Canu assembly parameters here or simply leave it empty like "" to use Canu's default assembly parameter. For example you could set "-correctedErrorRate=0.04" for high coverage (>60X) PacBio data and "-overlapper=mhap -utgReAlign=true" for high coverage (>60X) Nanopore data to improve the assembly speed. More than one customized parameters can be set here as long as they are separeted by space (e.g. "-option1=XXX -option2=YYY -option3=ZZZ"). Please consult Canu's manual "http://canu.readthedocs.io/en/latest/faq.html#what-parameters-can-i-tweak" for advanced customization settings. Default = "-correctedErrorRate=0.04" for the testing example. -threads=1 # The number of threads to use. Default = 1. +customized_canu_parameters="correctedErrorRate=0.04" # For assembler="canu" only. Users can set customized Canu assembly parameters here or simply leave it empty like customized_canu_parameters="" to use Canu's default assembly parameter. For example you could set customized_canu_parameters="correctedErrorRate=0.04" for high coverage (>60X) PacBio data and customized_canu_parameters="overlapper=mhap;utgReAlign=true" for high coverage (>60X) Nanopore data to improve the assembly speed. When assembling genomes with high heterozygosity, you can could set customized_canu_parameters="corOutCoverage=200;batOptions=-dg 3 -db 3 -dr 1 -ca 500 -cp 50" to avoid collasping haplotypes. As shown in these examples, more than one customized parameters can be set here as long as they are separeted by a semicolon and contained in a pair of double quotes (e.g. customized_canu_parameters="option1=XXX;option2=YYY;option3=ZZZ"). Please consult Canu's manual "http://canu.readthedocs.io/en/latest/faq.html#what-parameters-can-i-tweak" for advanced customization settings. Default = "correctedErrorRate=0.04" for the testing example. +threads=2 # The number of threads to use. Default = 2. vcf="yes" # Use "yes" if prefer to have vcf file generated to show SNP and INDEL differences between the assembled genome and the reference genome for their uniquely alignable regions. Otherwise use "no". Default = "yes". dotplot="yes" # Use "yes" if prefer to plot genome-wide dotplot based on the comparison with the reference genome below. Otherwise use "no". Default = "yes". ref_genome_raw="./../00.Ref_Genome/S288C.ASM205763v1.fa" # The file path of the raw reference genome. This is only needed when the option "dotplot=" or "vcf=" has been set as "yes". 
@@ -45,20 +45,23 @@ then fi fi - - out_dir=${prefix}_${assembler}_out if [[ "$assembler" == "canu" ]] then + OLDIFS=$IFS; + IFS=";" + customized_canu_parameters_array=($customized_canu_parameters) + IFS=$OLDIFS; + printf "%s\n" "${customized_canu_parameters_array[@]}" > $prefix.customized_canu_parameters.spec $canu_dir/canu -p $prefix -d $out_dir \ + -s $prefix.customized_canu_parameters.spec \ useGrid=false \ maxThreads=$threads \ genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ - -${long_reads_type} $long_reads \ - $customized_canu_parameters - + -${long_reads_type} $long_reads + mv $prefix.customized_canu_parameters.spec ./$out_dir/ perl $LRSDAY_HOME/scripts/simplify_seq_name.pl -i $out_dir/$prefix.contigs.fasta -o $prefix.assembly.$assembler.fa elif [[ "$assembler" == "flye" ]] then @@ -104,7 +107,6 @@ then genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ -${long_reads_type} $long_reads \ - # $customized_canu_parameters if [[ "$long_reads_type" == "pacbio-raw" ]] then @@ -133,7 +135,6 @@ then genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ -${long_reads_type} $long_reads \ - # $customized_canu_parameters mkdir -p $out_dir/wtdbg2 cd $out_dir/wtdbg2 @@ -149,7 +150,6 @@ then genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ -${long_reads_type} $long_reads \ - # $customized_canu_parameters mkdir -p $out_dir/smartdenovo cd $out_dir/smartdenovo @@ -195,7 +195,6 @@ then rm *.delta rm *.delta_filter rm ref_genome.fa - rm ref_genome.fa.fai if [[ $vcf == "yes" ]] then rm *.filter.coords diff --git a/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh b/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh index 745220c..fbcb282 100755 --- a/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh +++ b/Project_Template/02.Long-read-based_Assembly_Polishing/LRSDAY.02.Long-read-based_Assembly_Polishing.sh @@ -17,7 +17,7 @@ pacbio_reads_type="RSII" # The sequencing machine used to generate the input Pac ### When long_read_technology="nanopore" ### nanopore_fast5_files="./../00.Long_Reads/nanopore_fast5_files" # The file path to the directory containing raw Oxford Nanopore FAST5 files. -nanopore_albacore_sequencing_summary="./../00.Long_Reads/nanopore_fast5_files/sequencing_summary.txt" # The file path to the albacore basecaller sequencing summary output. This summary file is not necessary but it can help the polishing step to run much faster when available. When this file is unavailable, set nanopore_albacore_sequencing_summary="". +nanopore_basecalling_sequencing_summary="./../00.Long_Reads/nanopore_fast5_files/sequencing_summary.txt" # The file path to the nanopore albacore/guppy basecaller sequencing summary output. This summary file is not necessary but it can help the polishing step to run much faster when available. When this file is unavailable, set nanopore_basecalling_sequencing_summary="". prefix="SK1" # The file name prefix for the output files. Default = "SK1" for the testing example.
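Given the variable renaming in the hunk above, the two supported configurations of the summary file are simply (paths are the testing-example defaults; illustration only):

  # with an albacore/guppy sequencing_summary.txt available (guppy writes one into its basecalling
  # output directory), nanopolish index can skip scanning every fast5 file, which is much faster:
  nanopore_basecalling_sequencing_summary="./../00.Long_Reads/nanopore_fast5_files/sequencing_summary.txt"

  # without a summary file, leave the variable empty; the next hunk then runs
  # "nanopolish index -d" on the fast5 directory alone, which still works but is slower:
  nanopore_basecalling_sequencing_summary=""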
@@ -46,7 +46,7 @@ then $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf --diploid fi else - $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam + $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam if [[ $ploidy == "1" ]] then $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf @@ -61,11 +61,11 @@ then else # perform correction using the minimap2-nanopolish pipeline source $nanopolish_dir/py3_virtualenv_nanopolish/bin/activate - if [[ -z "$nanopore_albacore_sequencing_summary" ]] + if [[ -z "$nanopore_basecalling_sequencing_summary" ]] then $nanopolish_dir/nanopolish index -d $nanopore_fast5_files $long_reads_in_fastq else - $nanopolish_dir/nanopolish index -d $nanopore_fast5_files -s $nanopore_albacore_sequencing_summary $long_reads_in_fastq + $nanopolish_dir/nanopolish index -d $nanopore_fast5_files -s $nanopore_basecalling_sequencing_summary $long_reads_in_fastq fi java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary REFERENCE=$prefix.assembly.raw.fa OUTPUT=$prefix.assembly.raw.dict $minimap2_dir/minimap2 -ax map-ont $prefix.assembly.raw.fa $long_reads_in_fastq > $prefix.minimap2.sam diff --git a/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh b/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh index 483228a..e02c052 100755 --- a/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh +++ b/Project_Template/03.Illumina-read-based_Assembly_Polishing/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh @@ -9,7 +9,7 @@ source ./../../env.sh input_assembly="./../02.Long-read-based_Assembly_Polishing/SK1.assembly.long_read_polished.fa" # The file path of the input assembly before Illumina-based correction prefix="SK1" # The file name prefix for the output files. -threads=1 # The number of threads to use. Default = "1". +threads=1 # The number of threads to use. Default = "1". mode="PE" # Illumina sequencing mode, "PE" for paired-end sequencing and "SE" for single-end sequencing. Default = "PE". fixlist="snps,indels" # The types of errors for Illumina-read-based correction by Pilon; see Pilon's manual for more details. Default = "snps,indels". 
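The hunks below switch the Picard calls in this script from the legacy KEY=value arguments to the new dash-prefixed parser, selected explicitly with -Dpicard.useLegacyParser=false. A minimal before/after sketch with hypothetical file names:

  # legacy syntax (old behaviour):
  java -jar picard.jar SortSam INPUT=example.sam OUTPUT=example.sort.bam SORT_ORDER=coordinate
  # new syntax (what the updated commands use):
  java -Dpicard.useLegacyParser=false -jar picard.jar SortSam -INPUT example.sam -OUTPUT example.sort.bam -SORT_ORDER coordinate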
if [[ $mode == "PE" ]] @@ -60,30 +60,57 @@ fi # index reference sequence $samtools_dir/samtools faidx refseq.fa -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary REFERENCE=refseq.fa OUTPUT=refseq.dict +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary \ + -REFERENCE refseq.fa \ + -OUTPUT refseq.dict # sort bam file by picard-tools SortSam -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam INPUT=$prefix.sam OUTPUT=$prefix.sort.bam SORT_ORDER=coordinate +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam \ + -INPUT $prefix.sam \ + -OUTPUT $prefix.sort.bam \ + -SORT_ORDER coordinate # fixmate -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation INPUT=$prefix.sort.bam OUTPUT=$prefix.fixmate.bam +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation \ + -INPUT $prefix.sort.bam \ + -OUTPUT $prefix.fixmate.bam # add or replace read groups and sort -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups INPUT=$prefix.fixmate.bam OUTPUT=$prefix.rdgrp.bam \ - SORT_ORDER=coordinate RGID=$prefix RGLB=$prefix RGPL="Illumina" RGPU=$prefix RGSM=$prefix RGCN="RGCN" +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups \ + -INPUT $prefix.fixmate.bam \ + -OUTPUT $prefix.rdgrp.bam \ + -SORT_ORDER coordinate \ + -RGID $prefix \ + -RGLB $prefix \ + -RGPL "Illumina" \ + -RGPU $prefix \ + -RGSM $prefix \ + -RGCN "RGCN" # remove duplicates -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates INPUT=$prefix.rdgrp.bam REMOVE_DUPLICATES=true \ - METRICS_FILE=$prefix.dedup.matrics OUTPUT=$prefix.dedup.bam +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates \ + -INPUT $prefix.rdgrp.bam \ + -REMOVE_DUPLICATES true \ + -METRICS_FILE $prefix.dedup.matrics \ + -OUTPUT $prefix.dedup.bam # index the dedup.bam file $samtools_dir/samtools index $prefix.dedup.bam # GATK local realign # find realigner targets -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar -R refseq.fa -T RealignerTargetCreator -I $prefix.dedup.bam -o $prefix.realn.intervals +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.fa \ + -T RealignerTargetCreator \ + -I $prefix.dedup.bam \ + -o $prefix.realn.intervals # run realigner -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar -R refseq.fa -T IndelRealigner -I $prefix.dedup.bam -targetIntervals $prefix.realn.intervals -o $prefix.realn.bam +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.fa \ + -T IndelRealigner \ + -I $prefix.dedup.bam \ + -targetIntervals $prefix.realn.intervals \ + -o $prefix.realn.bam # index final bam file $samtools_dir/samtools index $prefix.realn.bam @@ -91,9 +118,23 @@ $samtools_dir/samtools index $prefix.realn.bam # for PE sequencing if [[ 
$mode == "PE" ]] then - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar --genome $input_assembly --frags $prefix.realn.bam --fix $fixlist --vcf --changes --output $prefix.assembly.illumina_read_polished >$prefix.log + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome $input_assembly \ + --frags $prefix.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.illumina_read_polished \ + >$prefix.log else - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar --genome $input_assembly --unpaired $prefix.realn.bam --fix $fixlist --vcf --changes --output $prefix.assembly.illumina_read_polished >$prefix.log + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome $input_assembly \ + --unpaired $prefix.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.illumina_read_polished \ + >$prefix.log fi perl $LRSDAY_HOME/scripts/summarize_pilon_correction.pl -i $prefix.assembly.illumina_read_polished.changes diff --git a/Project_Template/06.Mitochondrial_Genome_Assembly_Improvement/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh b/Project_Template/06.Mitochondrial_Genome_Assembly_Improvement/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh index 507ec98..f936551 100755 --- a/Project_Template/06.Mitochondrial_Genome_Assembly_Improvement/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh +++ b/Project_Template/06.Mitochondrial_Genome_Assembly_Improvement/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh @@ -83,20 +83,24 @@ fi # clean up intermediate files if [[ $debug == "no" ]] then - rm *.filter.fplot - rm *.filter.rplot - rm *.delta - rm *.delta_filter - rm $prefix.assembly.mt_contig.* - rm $prefix.assembly.non_mt_contig.* - rm $prefix.assembly.mt_improved.chrMT.filter.ps - rm $prefix.assembly.mt_improved.chrMT.filter.gp - rm $prefix.assembly.mt_improved.chrMT.filter_adjust.gp - rm ref.chrMT.list - rm ref.chrMT.fa - rm $prefix.assembly.non_primary_contig.list - rm $prefix.assembly.primary_contig.fa - rm $prefix.assembly.non_primary_contig.fa + + if [[ $(egrep -c "^>" "$prefix.assembly.mt_contig.fa") -ne 0 ]] + then + rm *.filter.fplot + rm *.filter.rplot + rm *.delta + rm *.delta_filter + rm $prefix.assembly.mt_contig.* + rm $prefix.assembly.non_mt_contig.* + rm $prefix.assembly.mt_improved.chrMT.filter.ps + rm $prefix.assembly.mt_improved.chrMT.filter.gp + rm $prefix.assembly.mt_improved.chrMT.filter_adjust.gp + rm ref.chrMT.list + rm ref.chrMT.fa + rm $prefix.assembly.non_primary_contig.list + rm $prefix.assembly.primary_contig.fa + rm $prefix.assembly.non_primary_contig.fa + fi if [ -e "$prefix.assembly.for_fixstart.for_skip.fa" ] then rm $prefix.assembly.for_fixstart.for_skip.fa diff --git a/README.md b/README.md index a83afb7..1b215a4 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Jia-Xing Yue & Gianni Liti. (2018) Long-read sequencing data analysis for yeasts Jia-Xing Yue, Jing Li, Louise Aigrain, Johan Hallin, Karl Persson, Karen Oliver, Anders Bergström, Paul Coupland, Jonas Warringer, Marco Cosentino Lagomarsino, Gilles Fischer, Richard Durbin, Gianni Liti. (2017) Contrasting evolutionary genome dynamics between domesticated and wild yeasts. *Nature Genetics*, 49:913-924. 
## Release history +* v1.3.1 Released on 2019/01/22 * v1.3.0 Released on 2018/11/13 * v1.2.0 Released on 2018/10/15 * v1.1.0 Released on 2018/07/11 @@ -29,7 +30,7 @@ LRSDAY itself is distributed under the MIT license. A number of LRSDAY's depende ## Requirements ### Hardware, operating system and network -This protocol is designed for a desktop or computing server running an x86-64-bit Linux operating system. Multithreaded processors are preferred to speed up the process since many steps can be configured to use multiple threads in parallel. For assembling and analyzing the budding yeast genomes (genome size = ~12.5 Mb), at least 16 Gb of RAM and 100 Gb of free disk space are recomended. When adapted for other eukaryotic organisms with larger genome sizes, the RAM and disk space consumption will scale up, majorly during de novo genome assembly (performed by [Canu](https://github.com/marbl/canu). Plese refer to [Canu’s manual](http://canu.readthedocs.io/en/latest/) for suggested RAM and disk space consumption for assembling large genomes. Stable Internet connection is required for the installation and configuration of LRSDAY as well as for retrieving the test data. +This protocol is designed for a desktop or computing server running an x86-64-bit Linux operating system. Multithreaded processors are preferred to speed up the process since many steps can be configured to use multiple threads in parallel. For assembling and analyzing the budding yeast genomes (genome size = ~12.5 Mb), at least 16 Gb of RAM and 100 Gb of free disk space are recommended. When adapted for other eukaryotic organisms with larger genome sizes, the RAM and disk space consumption will scale up, mainly during *de novo* genome assembly (performed by [Canu](https://github.com/marbl/canu) by default). Please refer to [Canu’s manual](http://canu.readthedocs.io/en/latest/) for suggested RAM and disk space consumption for assembling large genomes. A stable Internet connection is required for the installation and configuration of LRSDAY as well as for retrieving the test data.
### Software or library requirements @@ -44,7 +45,7 @@ This protocol is designed for a desktop or computing server running an x86-64-bi * Java runtime environment (JRE) v1.8.0 or newer (https://www.java.com) * Perl v5.12 or newer (https://www.perl.org/) * Python v2.7.9 or newer (https://www.python.org/) -* Python v3.6 or newer (https://www.python.org/) +* Python v3.4 or newer (https://www.python.org/) * Tar (https://www.gnu.org/software/tar/) * Unzip (http://infozip.sourceforge.net/UnZip.html) * Virtualenv v15.1.0 or newer (https://virtualenv.pypa.io) diff --git a/install_dependencies.sh b/install_dependencies.sh index dfd821e..6a15f55 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -1,74 +1,74 @@ #!/bin/bash -# last update: 2018/10/26 +# last update: 2019/01/18 set -e -o pipefail LRSDAY_HOME=$(pwd) BUILD="build" -SRA_VERSION="2.9.2" # "2.9.2" -PORECHOP_VERSION="0.2.4" # "0.2.3" +SRA_VERSION="2.9.2" # released on 2018.09.26 +PORECHOP_VERSION="0.2.4" # PORECHOP_GITHUB_COMMIT_VERSION="109e437" # committed on 2018.10.19 -FILTLONG_VERSION="0.2.0" +FILTLONG_VERSION="0.2.0" # FILTLONG_GITHUB_COMMIT_VERSION="d1bb46d" # committed on 2018.05.11 -MINIMAP2_VERSION="2.13" -CANU_VERSION="1.8" # "1.7.1" -FLYE_VERSION="2.3.6" # "2.3.6" -WTDBG2_VERSION="2.2" -WTDBG2_GITHUB_COMMIT_VERSION="db346b3" # committed on 2018.11.05 -SMARTDENOVO_VERSION="" # not available, so we use the github comit hash below for version control +MINIMAP2_VERSION="2.13" # released on 2018.10.11 +CANU_VERSION="1.8" # released on 2018.10.23 +FLYE_VERSION="2.4" # released on 2018.01.15 +WTDBG2_VERSION="2.3" # +WTDBG2_GITHUB_COMMIT_VERSION="4a39621" # committed on 2019.01.15 +SMARTDENOVO_VERSION="" # SMARTDENOVO_GITHUB_COMMIT_VERSION="5cc1356" # committed on 2018.02.19 -RAGOUT_VERSION="2.1.1" +RAGOUT_VERSION="2.1.1" # released on 2018.07.30 #QUAST_VERSION="5.0.1" # one of its dependency needs "csh" to be pre-installed -HDF_VERSION="1.10.1" -SONLIB_VERSION="" # not available, so we use the github comit hash below for version control +HDF_VERSION="1.10.1" # +SONLIB_VERSION="" # SONLIB_GITHUB_COMMIT_VERSION="1afbd97" # committed on 2017.08.09 HAL_VERSION="" # not available, so we use the github comit hash below for version control HAL_GITHUB_COMMIT_VERSION="a2ad656" # committed on 2017.09.09 -MUMMER_VERSION="4.0.0beta2" -GNUPLOT_VERSION="4.6.6" -BEDTOOLS_VERSION="2.27.1" -SPADES_VERSION="3.13.0" # "3.12.0" -PRODIGAL_VERSION="2.6.3" +MUMMER_VERSION="4.0.0beta2" # released on 2017.10.14 +GNUPLOT_VERSION="4.6.6" # released on 2015.02.18 +BEDTOOLS_VERSION="2.27.1" # released on 2017.12.14 +SPADES_VERSION="3.13.0" # released on 2018.10.16 +PRODIGAL_VERSION="2.6.3" # released on 2016.02.12 CAP_VERSION="" # see http://seq.cs.iastate.edu/cap3.html -BWA_VERSION="0.7.17" # "0.7.15" -SAMTOOLS_VERSION="1.9" #"1.3" -CIRCLATOR_VERSION="1.5.5" # "1.5.1" -TRIMMOMATIC_VERSION="0.36" -GATK_VERSION="3.6-6" -PICARD_VERSION="2.18.15" # "2.18.12" -PILON_VERSION="1.22" -EXONERATE_VERSION="2.2.0" -BLAST_VERSION="2.2.31" -RMBLAST_VERSION="2.2.28" -SNAP_VERSION="" # not available, so we use the github comit hash below for version control +BWA_VERSION="0.7.17" # released on 2017.10.23 +SAMTOOLS_VERSION="1.9" # released on 2018.07.18 +CIRCLATOR_VERSION="1.5.5" # released on 2018.01.31 +TRIMMOMATIC_VERSION="0.36" # +GATK_VERSION="3.6-6" # +PICARD_VERSION="2.18.15" # released on 2018.10.22 +PILON_VERSION="1.22" # released on 2017.03.15 +EXONERATE_VERSION="2.2.0" # +BLAST_VERSION="2.2.31" # +RMBLAST_VERSION="2.2.28" # +SNAP_VERSION="" # 
SNAP_GITHUB_COMMIT_VERSION="a89d68e" # committed on 2017.05.18 -RAPSEARCH_VERSION="2.24" -TRNASCAN_VERSION="1.3.1" -SNOSCAN_VERSION="0.9.1" -REPEATMASKER_VERSION="open-4-0-7" -TRF_VERSION="409" +RAPSEARCH_VERSION="2.24" # +TRNASCAN_VERSION="1.3.1" # +SNOSCAN_VERSION="0.9.1" # +REPEATMASKER_VERSION="open-4-0-7" # +TRF_VERSION="409" # REANNOTATE_VERSION="17.03.2015-LongQueryName" -CLUSTALW_VERSION="2.1" -MUSCLE_VERSION="3.8.31" -HMMER_VERSION="3.2.1" -BAMTOOLS_VERSION="2.4.2" -AUGUSTUS_VERSION="3.2.3" -#AUGUSTUS_GITHUB_COMMIT_VERSION="79960C5" -EVM_VERSION="1.1.1" -PROTEINORTHO_VERSION="5.16b" # "5.16" -MAKER_VERSION="3.00.0-beta" -MINICONDA2_VERSION="4.5.11" -PB_ASSEMBLY_VERSION="0.0.2" -BAX2BAM_VERSION="0.0.9" -NANOPOLISH_VERSION="0.10.2" -NANOPOLISH_GITHUB_COMMIT_VERSION="d7c09ab" # committed on 2018.11.02 -PARALLEL_VERSION="20180722" +CLUSTALW_VERSION="2.1" # +MUSCLE_VERSION="3.8.31" # +HMMER_VERSION="3.2.1" # released on 2018.06.13 +BAMTOOLS_VERSION="2.4.2" # released on 2017.11.02 +AUGUSTUS_VERSION="3.2.3" # +#AUGUSTUS_GITHUB_COMMIT_VERSION="79960c5" +EVM_VERSION="1.1.1" # released on 2015.07.03 +PROTEINORTHO_VERSION="5.16b" # released on 2017.09.22 +MAKER_VERSION="3.00.0-beta" # +MINICONDA2_VERSION="4.5.11" # +PB_ASSEMBLY_VERSION="0.0.2" # +BAX2BAM_VERSION="0.0.9" # +NANOPOLISH_VERSION="0.11.0" # +NANOPOLISH_GITHUB_COMMIT_VERSION="3180474" # commited on 2019.01.18 +PARALLEL_VERSION="20180722" # released on 2018.07.22 # for MFannot -EMBOSS_VERSION="6.5.7" -ERPIN_VERSION="5.5.4" -TBL2ASN_VERSION="" -PIROBJECT_VERSION="1.19" +EMBOSS_VERSION="6.5.7" # released on 2012.07.25 +ERPIN_VERSION="5.5.4" # +TBL2ASN_VERSION="" # +PIROBJECT_VERSION="1.19" # PIRMODELS_GITHUB_COMMIT_VERSION="6b223ec" # committed on 2016.08.30 FLIP_GITHUB_COMMIT_VERSION="00a57cb" # committed on 2016.04.07 UMAC_GITHUB_COMMIT_VERSION="cae618e" # committed on 2016.08.30 @@ -77,7 +77,7 @@ RNAFINDER_GITHUB_COMMIT_VERSION="579dc58" # committed on 2016.12.07 MF2SQN_GITHUB_COMMIT_VERSION="6faf9f4" # committed on 2016.12.07 GRAB_FASTA_GITHUB_COMMIT_VERSION="accd32d" # committed on 2017.02.14 MFANNOT_DATA_GITHUB_COMMIT_VERSION="b039ac5" # committed on 2016.12.07 -MFANNOT_VERSION="1.35" +MFANNOT_VERSION="1.35" # MFANNOT_GITHUB_COMMIT_VERSION="6472b97" # committed on 2018.10.31 # downloading URLs for dependencies @@ -185,7 +185,7 @@ download () { url=$1 download_location=$2 echo "Downloading $url to $download_location" - wget --no-check-certificate $url -O $download_location + wget -nv --no-check-certificate $url -O $download_location } # ---------- set Perl & Python environment variables ------------- @@ -196,7 +196,7 @@ PERL5LIB="$build_dir/cpanm/perlmods/lib/perl5:$PERL5LIB" mkdir -p $build_dir/cpanm cpanm_dir=$build_dir/cpanm cd $cpanm_dir -wget --no-check-certificate -O - https://cpanmin.us/ > cpanm +wget -nv --no-check-certificate -O - https://cpanmin.us/ > cpanm chmod +x cpanm mkdir perlmods $cpanm_dir/cpanm -l $cpanm_dir/perlmods --skip-installed Test::More@1.302086 @@ -243,7 +243,7 @@ cd $build_dir echo "Download Porechop-v${PORECHOP_VERSION}" git clone $PORECHOP_DOWNLOAD_URL cd Porechop -git reset --hard $PORECHOP_GITHUB_COMMIT_VERSION +git checkout -f -q $PORECHOP_GITHUB_COMMIT_VERSION virtualenv -p $(which python3) py3_virtualenv_porechop source py3_virtualenv_porechop/bin/activate py3_virtualenv_porechop/bin/python3 ./setup.py install @@ -255,7 +255,7 @@ cd $build_dir echo "Download Filtlong-v${FILTLONG_VERSION}" git clone $FILTLONG_DOWNLOAD_URL cd Filtlong -git reset --hard $FILTLONG_GITHUB_COMMIT_VERSION +git 
checkout -f -q $FILTLONG_GITHUB_COMMIT_VERSION make -j filtlong_dir="$build_dir/Filtlong/bin" @@ -298,7 +298,7 @@ cd $build_dir echo "Download wtdbg2-v${WTDBG2_VERSION}" git clone $WTDBG2_DOWNLOAD_URL cd wtdbg2 -git reset --hard $WTDBG2_GITHUB_COMMIT_VERSION +git checkout -f -q $WTDBG2_GITHUB_COMMIT_VERSION C_INCLUDE_PATH="" make wtdbg2_dir="$build_dir/wtdbg2" @@ -308,7 +308,7 @@ cd $build_dir echo "Download smartdenovo-v${SMARTDENOVO_VERSION}" git clone $SMARTDENOVO_DOWNLOAD_URL cd smartdenovo -git reset --hard $SMARTDENOVO_GITHUB_COMMIT_VERSION +git checkout -f -q $SMARTDENOVO_GITHUB_COMMIT_VERSION cp wtlay.h wtlay.h.bk cat wtlay.h.bk |sed s/inline//g > wtlay.h C_INCLUDE_PATH="" @@ -349,7 +349,7 @@ rm hdf5-${HDF_VERSION}.tar.gz cd $build_dir git clone $SONLIB_DOWNLOAD_URL cd sonLib -git reset --hard $SONLIB_GITHUB_COMMIT_VERSION +git checkout -f -q $SONLIB_GITHUB_COMMIT_VERSION make # ---------------- HAL ------------------- @@ -357,7 +357,7 @@ cd $build_dir echo "Download HAL-v${HAL_VERSION}" git clone $HAL_DOWNLOAD_URL cd hal -git reset --hard $HAL_GITHUB_COMMIT_VERSION +git checkout -f -q $HAL_GITHUB_COMMIT_VERSION make hal_dir="$build_dir/hal/bin" @@ -546,7 +546,7 @@ echo "Download snap-v${SNAP_VERSION}" git clone $SNAP_DOWNLOAD_URL snap_dir="$build_dir/SNAP" cd $snap_dir -git reset --hard $SNAP_GITHUB_COMMIT_VERSION +git checkout -f -q $SNAP_GITHUB_COMMIT_VERSION ZOE="$snap_dir/Zoe" cp $LRSDAY_HOME/misc/snap.c . # temporary fix for snap with gcc-8 make @@ -705,7 +705,7 @@ rm augustus-${AUGUSTUS_VERSION}.tar.gz # git clone $AUGUSTUS_DOWNLOAD_URL # cd Augustus # git submodule update --init -# git reset --hard $AUGUSTUS_GITHUB_COMMIT_VERSION +# git checkout -f -q $AUGUSTUS_GITHUB_COMMIT_VERSION # cd $build_dir/augustus-${AUGUSTUS_VERSION}/auxprogs/bam2hints/ # cp $LRSDAY_HOME/misc/bam2hints.Makefile Makefile # cd $build_dir/augustus-${AUGUSTUS_VERSION}/auxprogs/filterBam/src/ @@ -791,7 +791,7 @@ echo "Download nanopolish-v${NANOPOLISH_VERSION}" git clone --recursive $NANOPOLISH_DOWNLOAD_URL nanopolish_dir="$build_dir/nanopolish" cd $nanopolish_dir -git reset --hard $NANOPOLISH_GITHUB_COMMIT_VERSION +git checkout -f -q $NANOPOLISH_GITHUB_COMMIT_VERSION make virtualenv -p $(which python3) py3_virtualenv_nanopolish source py3_virtualenv_nanopolish/bin/activate @@ -864,7 +864,7 @@ cd $build_dir echo "Download PirModels" git clone $PIRMODELS_DOWNLOAD_URL cd PirModels -git reset --hard $PIRMODELS_GITHUB_COMMIT_VERSION +git checkout -f -q $PIRMODELS_GITHUB_COMMIT_VERSION cd .. 
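A side note on the recurring change from "git reset --hard" to "git checkout -f -q" in this installer: both pin the working tree to the recorded commit, but they treat branches differently. A small illustration with a hypothetical repository (the commit hash is just an example):

  git clone https://example.org/some-tool.git && cd some-tool
  git reset --hard a89d68e     # moves the current branch itself to commit a89d68e
  git checkout -f -q a89d68e   # detaches HEAD at a89d68e without moving any branch; -f discards local edits, -q silences the detached-HEAD notice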
cp -r PirModels $pirobject_dir pirmodels_dir="$perlobject_dir/PirModels" @@ -874,7 +874,7 @@ cd $build_dir echo "Download Flip" git clone $FLIP_DOWNLOAD_URL cd Flip -git reset --hard $FLIP_GITHUB_COMMIT_VERSION +git checkout -f -q $FLIP_GITHUB_COMMIT_VERSION cd src make cp flip ./../ @@ -885,7 +885,7 @@ cd $build_dir echo "Download Umac" git clone $UMAC_DOWNLOAD_URL cd Umac -git reset --hard $UMAC_GITHUB_COMMIT_VERSION +git checkout -f -q $UMAC_GITHUB_COMMIT_VERSION umac_dir="$build_dir/Umac" # --------------- HMMsearchWC ------------------ @@ -893,7 +893,7 @@ cd $build_dir echo "Download HMMsearchWC" git clone $HMMSEARCHWC_DOWNLOAD_URL cd HMMsearchWC -git reset --hard $HMMSEARCHWC_GITHUB_COMMIT_VERSION +git checkout -f -q $HMMSEARCHWC_GITHUB_COMMIT_VERSION hmmsearchwc_dir="$build_dir/HMMsearchWC" # --------------- RNAfinder ------------------ @@ -901,7 +901,7 @@ cd $build_dir echo "Download RNAfinder" git clone $RNAFINDER_DOWNLOAD_URL cd RNAfinder -git reset --hard $RNAFINDER_GITHUB_COMMIT_VERSION +git checkout -f -q $RNAFINDER_GITHUB_COMMIT_VERSION rnafinder_dir="$build_dir/RNAfinder" # --------------- Mf2sqn ------------------ @@ -909,7 +909,7 @@ cd $build_dir echo "Download Mf2sqn" git clone $MF2SQN_DOWNLOAD_URL cd Mf2sqn -git reset --hard $MF2SQN_GITHUB_COMMIT_VERSION +git checkout -f -q $MF2SQN_GITHUB_COMMIT_VERSION mf2sqn_dir="$build_dir/Mf2sqn" cp qualifs.pl $build_dir/cpanm/perlmods/lib/perl5 @@ -918,7 +918,7 @@ cd $build_dir echo "Download grab-fasta" git clone $GRAB_FASTA_DOWNLOAD_URL cd grab-fasta -git reset --hard $GRAB_FASTA_GITHUB_COMMIT_VERSION +git checkout -f -q $GRAB_FASTA_GITHUB_COMMIT_VERSION grab_fasta_dir="$build_dir/grab-fasta" # --------------- MFannot_data ------------------ @@ -926,7 +926,7 @@ cd $build_dir echo "Download MFannot_data" git clone $MFANNOT_DATA_DOWNLOAD_URL cd MFannot_data -git reset --hard $MFANNOT_DATA_GITHUB_COMMIT_VERSION +git checkout -f -q $MFANNOT_DATA_GITHUB_COMMIT_VERSION mfannot_data_dir="$build_dir/MFannot_data" # --------------- MFannot ------------------ @@ -934,7 +934,7 @@ cd $build_dir echo "Download MFannot" git clone $MFANNOT_DOWNLOAD_URL cd MFannot -git reset --hard $MFANNOT_GITHUB_COMMIT_VERSION +git checkout -f -q $MFANNOT_GITHUB_COMMIT_VERSION mfannot_dir="$build_dir/MFannot" diff --git a/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh b/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh index 51464bf..82d5682 100755 --- a/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh +++ b/pipelines/LRSDAY.00.Long_Reads_Preprocessing.sh @@ -24,7 +24,7 @@ echo "genome_size=$genome_size, post_filtering_coverage=$post_filtering_coverage echo "" if [[ "$reads_type" == "nanopore-raw" || "$reads_type" == "nanopore-corrected" ]] then - $porechop_dir/porechop -i $reads -o $prefix.porechop.fastq.gz --threads $threads > $prefix.porechop.summary.txt + $porechop_dir/porechop -i $reads -o $prefix.porechop.fastq.gz --discard_middle --threads $threads > $prefix.porechop.summary.txt if [[ "$run_filtering" == "yes" ]] then $filtlong_dir/filtlong --min_length 1000 --keep_percent 90 --target_bases $filtlong_target_bases $prefix.porechop.fastq.gz | gzip > $prefix.filtlong.fastq.gz diff --git a/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh b/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh index 3ceac14..b2f9564 100755 --- a/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh +++ b/pipelines/LRSDAY.00.Retrieve_Sample_PacBio_Reads.sh @@ -24,10 +24,12 @@ rm $file_name cd pacbio_fofn_files echo "download the metadata and raw PacBio reads in .h5 format ..." 
-wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.metadata.xml +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.metadata.xml +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.metadata.xml +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.metadata.xml if [[ ! -d Analysis_Results ]] then @@ -35,39 +37,64 @@ then fi cd Analysis_Results -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA535/ERA535258/pacbio_hdf5/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 - -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 -wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 -wget 
ftp://ftp.sra.ebi.ac.uk/vol1/ERA525/ERA525888/pacbio_hdf5/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080522/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080529/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.bas.h5 + +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080536/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR108/ERR1080537/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.bas.h5 + +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 +# wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR112/ERR1124245/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.bas.h5 + +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 +wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR1140978/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.bas.h5 + +### echo $(pwd)/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.1.RSII_bax.fofn echo $(pwd)/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.1.RSII_bax.fofn echo $(pwd)/m150811_092723_00127_c100844062550000001823187612311514_s1_p0.3.bax.h5 
>> ./../$prefix.SMRTCell.1.RSII_bax.fofn + echo $(pwd)/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.2.RSII_bax.fofn echo $(pwd)/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.2.RSII_bax.fofn echo $(pwd)/m150814_233250_00127_c100823152550000001823177111031542_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.2.RSII_bax.fofn + echo $(pwd)/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.3.RSII_bax.fofn echo $(pwd)/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.3.RSII_bax.fofn echo $(pwd)/m150911_220012_00127_c100861772550000001823190702121671_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.3.RSII_bax.fofn + echo $(pwd)/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.4.RSII_bax.fofn echo $(pwd)/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.4.RSII_bax.fofn echo $(pwd)/m150813_110541_00127_c100823112550000001823177111031581_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.4.RSII_bax.fofn +# echo $(pwd)/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.5.RSII_bax.fofn +# echo $(pwd)/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.5.RSII_bax.fofn +# echo $(pwd)/m150814_201337_00127_c100823152550000001823177111031541_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.5.RSII_bax.fofn + +# echo $(pwd)/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.1.bax.h5 >> ./../$prefix.SMRTCell.6.RSII_bax.fofn +# echo $(pwd)/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.2.bax.h5 >> ./../$prefix.SMRTCell.6.RSII_bax.fofn +# echo $(pwd)/m150910_184604_00127_c100822732550000001823176011031536_s1_p0.3.bax.h5 >> ./../$prefix.SMRTCell.6.RSII_bax.fofn + + cd .. cd .. diff --git a/pipelines/LRSDAY.01.Long-read-based_Genome_Assembly.sh b/pipelines/LRSDAY.01.Long-read-based_Genome_Assembly.sh index 222af03..bda2793 100755 --- a/pipelines/LRSDAY.01.Long-read-based_Genome_Assembly.sh +++ b/pipelines/LRSDAY.01.Long-read-based_Genome_Assembly.sh @@ -12,8 +12,8 @@ long_reads="./../00.Long_Reads/SK1.filtered_subreads.fastq.gz" # The file path o long_reads_type="pacbio-raw" # The long reads data type. Use "pacbio-raw" or "pacbio-corrected" or "nanopore-raw" or "nanopore-corrected". Default = "pacbio-raw" for the testing example genome_size="12.5m" # The estimated genome size with the format of <number>[g|m|k], e.g. 12.5m for 12.5 Mb. Default = "12.5m". assembler="canu" # The long-read assembler: Use "canu" or "flye" or "wtdbg2" or "smartdenovo" or "canu-flye" or "canu-wtdbg2" or "canu-smartdenovo". For "canu-flye", "canu-wtdbg2", and "canu-smartdenovo", the assembler canu is used first to generate error-corrected reads from the raw reads and then the assembler flye/wtdbg2/smartdenovo is used to assemble the genome. Based on our test, assembler="canu" generally gives the best result but will take substantially longer time than the other options. -customized_canu_parameters="-correctedErrorRate=0.04" # For assembler="canu" only. Users can set customized Canu assembly parameters here or simply leave it empty like "" to use Canu's default assembly parameter. For example you could set "-correctedErrorRate=0.04" for high coverage (>60X) PacBio data and "-overlapper=mhap -utgReAlign=true" for high coverage (>60X) Nanopore data to improve the assembly speed. 
More than one customized parameters can be set here as long as they are separeted by space (e.g. "-option1=XXX -option2=YYY -option3=ZZZ"). Please consult Canu's manual "http://canu.readthedocs.io/en/latest/faq.html#what-parameters-can-i-tweak" for advanced customization settings. Default = "-correctedErrorRate=0.04" for the testing example. -threads=1 # The number of threads to use. Default = 1. +customized_canu_parameters="correctedErrorRate=0.04" # For assembler="canu" only. Users can set customized Canu assembly parameters here or simply leave it empty like customized_canu_parameters="" to use Canu's default assembly parameters. For example, you could set customized_canu_parameters="correctedErrorRate=0.04" for high coverage (>60X) PacBio data and customized_canu_parameters="overlapper=mhap;utgReAlign=true" for high coverage (>60X) Nanopore data to improve the assembly speed. When assembling genomes with high heterozygosity, you could set customized_canu_parameters="corOutCoverage=200;batOptions=-dg 3 -db 3 -dr 1 -ca 500 -cp 50" to avoid collapsing haplotypes. As shown in these examples, more than one customized parameter can be set here as long as they are separated by semicolons and enclosed in a pair of double quotes (e.g. customized_canu_parameters="option1=XXX;option2=YYY;option3=ZZZ"). Please consult Canu's manual "http://canu.readthedocs.io/en/latest/faq.html#what-parameters-can-i-tweak" for advanced customization settings. Default = "correctedErrorRate=0.04" for the testing example. +threads=2 # The number of threads to use. Default = 2. vcf="yes" # Use "yes" if prefer to have vcf file generated to show SNP and INDEL differences between the assembled genome and the reference genome for their uniquely alignable regions. Otherwise use "no". Default = "yes". dotplot="yes" # Use "yes" if prefer to plot genome-wide dotplot based on the comparison with the reference genome below. Otherwise use "no". Default = "yes". ref_genome_raw="./../00.Ref_Genome/S288C.ASM205763v1.fa" # The file path of the raw reference genome. This is only needed when the option "dotplot=" or "vcf=" has been set as "yes".
@@ -45,20 +45,23 @@ then fi fi - - out_dir=${prefix}_${assembler}_out if [[ "$assembler" == "canu" ]] then + OLDIFS=$IFS; + IFS=";" + customized_canu_parameters_array=($customized_canu_parameters) + IFS=$OLDIFS; + printf "%s\n" "${customized_canu_parameters_array[@]}" > $prefix.customized_canu_parameters.spec $canu_dir/canu -p $prefix -d $out_dir \ + -s $prefix.customized_canu_parameters.spec \ useGrid=false \ maxThreads=$threads \ genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ - -${long_reads_type} $long_reads \ - $customized_canu_parameters - + -${long_reads_type} $long_reads + mv $prefix.customized_canu_parameters.spec ./$out_dir/ perl $LRSDAY_HOME/scripts/simplify_seq_name.pl -i $out_dir/$prefix.contigs.fasta -o $prefix.assembly.$assembler.fa elif [[ "$assembler" == "flye" ]] then @@ -104,7 +107,6 @@ then genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ -${long_reads_type} $long_reads \ - # $customized_canu_parameters if [[ "$long_reads_type" == "pacbio-raw" ]] then @@ -133,7 +135,6 @@ then genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ -${long_reads_type} $long_reads \ - # $customized_canu_parameters mkdir -p $out_dir/wtdbg2 cd $out_dir/wtdbg2 @@ -149,7 +150,6 @@ then genomeSize=$genome_size \ gnuplot=$gnuplot_dir/gnuplot \ -${long_reads_type} $long_reads \ - # $customized_canu_parameters mkdir -p $out_dir/smartdenovo cd $out_dir/smartdenovo @@ -195,7 +195,6 @@ then rm *.delta rm *.delta_filter rm ref_genome.fa - rm ref_genome.fa.fai if [[ $vcf == "yes" ]] then rm *.filter.coords diff --git a/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh b/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh index 745220c..fbcb282 100755 --- a/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh +++ b/pipelines/LRSDAY.02.Long-read-based_Assembly_Polishing.sh @@ -17,7 +17,7 @@ pacbio_reads_type="RSII" # The sequencing machine used to generate the input Pac ### When long_read_technology="nanopore" ### nanopore_fast5_files="./../00.Long_Reads/nanopore_fast5_files" # The file path to the directory containing raw Oxford Nanopore FAST5 files. -nanopore_albacore_sequencing_summary="./../00.Long_Reads/nanopore_fast5_files/sequencing_summary.txt" # The file path to the albacore basecaller sequencing summary output. This summary file is not necessary but it can help the polishing step to run much faster when available. When this file is unavailable, set nanopore_albacore_sequencing_summary="". +nanopore_basecalling_sequencing_summary="./../00.Long_Reads/nanopore_fast5_files/sequencing_summary.txt" # The file path to the nanopore albacore/guppy basecaller sequencing summary output. This summary file is not necessary but it can help the polishing step to run much faster when available. When this file is unavailable, set nanopore_basecalling_sequencing_summary="". prefix="SK1" # The file name prefix for the output files. Default = "SK1" for the testing example.
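A small configuration sketch of the two intended settings for the renamed summary variable (paths follow the testing example):

    # with an albacore/guppy sequencing summary available (speeds up nanopolish indexing):
    nanopore_basecalling_sequencing_summary="./../00.Long_Reads/nanopore_fast5_files/sequencing_summary.txt"
    # without a summary file:
    nanopore_basecalling_sequencing_summary=""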
@@ -46,7 +46,7 @@ then $conda_pacbio_dir/variantCaller --algorithm=quiver -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf --diploid fi else - $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam + $conda_pacbio_dir/pbalign --nproc $threads --algorithm blasr --tmpDir ./tmp $pacbio_bam_fofn_file $prefix.assembly.raw.fa $prefix.pbalign.bam if [[ $ploidy == "1" ]] then $conda_pacbio_dir/variantCaller --algorithm=arrow -x 5 -X 120 -q 20 -v -j $threads $prefix.pbalign.bam -r $prefix.assembly.raw.fa -o $prefix.assembly.consensus.fa -o $prefix.assembly.consensus.fq -o $prefix.assembly.consensus.vcf @@ -61,11 +61,11 @@ then else # perform correction using the minimap2-nanopolish pipeline source $nanopolish_dir/py3_virtualenv_nanopolish/bin/activate - if [[ -z "$nanopore_albacore_sequencing_summary" ]] + if [[ -z "$nanopore_basecalling_sequencing_summary" ]] then $nanopolish_dir/nanopolish index -d $nanopore_fast5_files $long_reads_in_fastq else - $nanopolish_dir/nanopolish index -d $nanopore_fast5_files -s $nanopore_albacore_sequencing_summary $long_reads_in_fastq + $nanopolish_dir/nanopolish index -d $nanopore_fast5_files -s $nanopore_basecalling_sequencing_summary $long_reads_in_fastq fi java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary REFERENCE=$prefix.assembly.raw.fa OUTPUT=$prefix.assembly.raw.dict $minimap2_dir/minimap2 -ax map-ont $prefix.assembly.raw.fa $long_reads_in_fastq > $prefix.minimap2.sam diff --git a/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh b/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh index 483228a..e02c052 100755 --- a/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh +++ b/pipelines/LRSDAY.03.Illumina-read-based_Assembly_Polishing.sh @@ -9,7 +9,7 @@ source ./../../env.sh input_assembly="./../02.Long-read-based_Assembly_Polishing/SK1.assembly.long_read_polished.fa" # The file path of the input assembly before Illumina-based correction prefix="SK1" # The file name prefix for the output files. -threads=1 # The number of threads to use. Default = "1". +threads=1 # The number of threads to use. Default = "1". mode="PE" # Illumina sequencing mode, "PE" for paired-end sequencing and "SE" for single-end sequencing. Default = "PE". fixlist="snps,indels" # The types of errors for Illumina-read-based correction by Pilon; see Pilon's manual for more details. Default = "snps,indels". 
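The next hunk switches the picard invocations to picard's new command-line syntax; a minimal before/after sketch of a single call (using the refseq.fa and refseq.dict names from the script, with the other JVM options omitted for brevity):

    # legacy syntax (previous version of the script):
    java -jar $picard_dir/picard.jar CreateSequenceDictionary REFERENCE=refseq.fa OUTPUT=refseq.dict
    # new syntax, with the legacy parser explicitly disabled:
    java -Dpicard.useLegacyParser=false -jar $picard_dir/picard.jar CreateSequenceDictionary \
        -REFERENCE refseq.fa \
        -OUTPUT refseq.dict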
if [[ $mode == "PE" ]] @@ -60,30 +60,57 @@ fi # index reference sequence $samtools_dir/samtools faidx refseq.fa -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary REFERENCE=refseq.fa OUTPUT=refseq.dict +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar CreateSequenceDictionary \ + -REFERENCE refseq.fa \ + -OUTPUT refseq.dict # sort bam file by picard-tools SortSam -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam INPUT=$prefix.sam OUTPUT=$prefix.sort.bam SORT_ORDER=coordinate +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar SortSam \ + -INPUT $prefix.sam \ + -OUTPUT $prefix.sort.bam \ + -SORT_ORDER coordinate # fixmate -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation INPUT=$prefix.sort.bam OUTPUT=$prefix.fixmate.bam +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar FixMateInformation \ + -INPUT $prefix.sort.bam \ + -OUTPUT $prefix.fixmate.bam # add or replace read groups and sort -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups INPUT=$prefix.fixmate.bam OUTPUT=$prefix.rdgrp.bam \ - SORT_ORDER=coordinate RGID=$prefix RGLB=$prefix RGPL="Illumina" RGPU=$prefix RGSM=$prefix RGCN="RGCN" +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar AddOrReplaceReadGroups \ + -INPUT $prefix.fixmate.bam \ + -OUTPUT $prefix.rdgrp.bam \ + -SORT_ORDER coordinate \ + -RGID $prefix \ + -RGLB $prefix \ + -RGPL "Illumina" \ + -RGPU $prefix \ + -RGSM $prefix \ + -RGCN "RGCN" # remove duplicates -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates INPUT=$prefix.rdgrp.bam REMOVE_DUPLICATES=true \ - METRICS_FILE=$prefix.dedup.matrics OUTPUT=$prefix.dedup.bam +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $picard_dir/picard.jar MarkDuplicates \ + -INPUT $prefix.rdgrp.bam \ + -REMOVE_DUPLICATES true \ + -METRICS_FILE $prefix.dedup.matrics \ + -OUTPUT $prefix.dedup.bam # index the dedup.bam file $samtools_dir/samtools index $prefix.dedup.bam # GATK local realign # find realigner targets -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar -R refseq.fa -T RealignerTargetCreator -I $prefix.dedup.bam -o $prefix.realn.intervals +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.fa \ + -T RealignerTargetCreator \ + -I $prefix.dedup.bam \ + -o $prefix.realn.intervals # run realigner -java -Djava.io.tmpdir=./tmp -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar -R refseq.fa -T IndelRealigner -I $prefix.dedup.bam -targetIntervals $prefix.realn.intervals -o $prefix.realn.bam +java -Djava.io.tmpdir=./tmp -Dpicard.useLegacyParser=false -XX:ParallelGCThreads=$threads -jar $gatk_dir/GenomeAnalysisTK.jar \ + -R refseq.fa \ + -T IndelRealigner \ + -I $prefix.dedup.bam \ + -targetIntervals $prefix.realn.intervals \ + -o $prefix.realn.bam # index final bam file $samtools_dir/samtools index $prefix.realn.bam @@ -91,9 +118,23 @@ $samtools_dir/samtools index $prefix.realn.bam # for PE sequencing if [[ 
$mode == "PE" ]] then - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar --genome $input_assembly --frags $prefix.realn.bam --fix $fixlist --vcf --changes --output $prefix.assembly.illumina_read_polished >$prefix.log + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome $input_assembly \ + --frags $prefix.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.illumina_read_polished \ + >$prefix.log else - java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar --genome $input_assembly --unpaired $prefix.realn.bam --fix $fixlist --vcf --changes --output $prefix.assembly.illumina_read_polished >$prefix.log + java -Djava.io.tmpdir=./tmp -Xmx16G -XX:ParallelGCThreads=$threads -jar $pilon_dir/pilon.jar \ + --genome $input_assembly \ + --unpaired $prefix.realn.bam \ + --fix $fixlist \ + --vcf \ + --changes \ + --output $prefix.assembly.illumina_read_polished \ + >$prefix.log fi perl $LRSDAY_HOME/scripts/summarize_pilon_correction.pl -i $prefix.assembly.illumina_read_polished.changes diff --git a/pipelines/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh b/pipelines/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh index 507ec98..f936551 100755 --- a/pipelines/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh +++ b/pipelines/LRSDAY.06.Mitochondrial_Genome_Assembly_Improvement.sh @@ -83,20 +83,24 @@ fi # clean up intermediate files if [[ $debug == "no" ]] then - rm *.filter.fplot - rm *.filter.rplot - rm *.delta - rm *.delta_filter - rm $prefix.assembly.mt_contig.* - rm $prefix.assembly.non_mt_contig.* - rm $prefix.assembly.mt_improved.chrMT.filter.ps - rm $prefix.assembly.mt_improved.chrMT.filter.gp - rm $prefix.assembly.mt_improved.chrMT.filter_adjust.gp - rm ref.chrMT.list - rm ref.chrMT.fa - rm $prefix.assembly.non_primary_contig.list - rm $prefix.assembly.primary_contig.fa - rm $prefix.assembly.non_primary_contig.fa + + if [[ $(egrep -c "^>" "$prefix.assembly.mt_contig.fa") -ne 0 ]] + then + rm *.filter.fplot + rm *.filter.rplot + rm *.delta + rm *.delta_filter + rm $prefix.assembly.mt_contig.* + rm $prefix.assembly.non_mt_contig.* + rm $prefix.assembly.mt_improved.chrMT.filter.ps + rm $prefix.assembly.mt_improved.chrMT.filter.gp + rm $prefix.assembly.mt_improved.chrMT.filter_adjust.gp + rm ref.chrMT.list + rm ref.chrMT.fa + rm $prefix.assembly.non_primary_contig.list + rm $prefix.assembly.primary_contig.fa + rm $prefix.assembly.non_primary_contig.fa + fi if [ -e "$prefix.assembly.for_fixstart.for_skip.fa" ] then rm $prefix.assembly.for_fixstart.for_skip.fa diff --git a/prerequisite.txt b/prerequisite.txt index 68fe19f..d372048 100644 --- a/prerequisite.txt +++ b/prerequisite.txt @@ -19,7 +19,7 @@ This protocol is designed for a desktop or computing server running an x86-64-bi ● Java runtime environment (JRE) v1.8.0 or newer (https://www.java.com) ● Perl v5.12 or newer (https://www.perl.org/) ● Python v2.7.9 or newer (https://www.python.org/) -● Python v3.6 or newer (https://www.python.org/) +● Python v3.4 or newer (https://www.python.org/) ● Tar (https://www.gnu.org/software/tar/) ● Unzip (http://infozip.sourceforge.net/UnZip.html) ● Virtualenv v15.1.0 or newer (https://virtualenv.pypa.io) diff --git a/scripts/fasta_length.pl b/scripts/fasta_length.pl new file mode 100755 index 0000000..aaf3eda --- /dev/null +++ b/scripts/fasta_length.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl +use warnings; +use strict; +use 
Getopt::Long; + +############################################################## +# script: fasta_length.pl +# author: Jia-Xing Yue (GitHub ID: yjx1217) +# last edited: 2018.01.14 +# description: calculate the length of each sequence in the input fasta file +# example: perl fasta_length.pl -i input.fa(.gz) -o output.length.txt(.gz) -sort_by_length yes +############################################################## + +my ($input, $output, $sort_by_length); +$sort_by_length = "no"; # default: no resorting, keep the input sequence order + +GetOptions('input|i:s' => \$input, + 'output|o:s' => \$output, + 'sort_by_length|s:s' => \$sort_by_length); + + +my $input_fh = read_file($input); +my @input = (); +my %input = (); +parse_fasta_file($input_fh, \%input, \@input); + +my $output_fh = write_file($output); + +my %length = (); +foreach my $id (@input) { + my $length = length $input{$id}; + $length{$id} = $length; +} + +if ($sort_by_length eq "no") { + foreach my $id (@input) { + print $output_fh "$id\t$length{$id}\n"; + } +} else { + foreach my $id (sort {$length{$b} <=> $length{$a}} keys %length) { + print $output_fh "$id\t$length{$id}\n"; + } +} + +sub read_file { + my $file = shift @_; + my $fh; + if ($file =~ /\.gz$/) { + open($fh, "gunzip -c $file |") or die "can't open pipe to $file"; + } else { + open($fh, $file) or die "can't open $file"; + } + return $fh; +} + +sub write_file { + my $file = shift @_; + my $fh; + if ($file =~ /\.gz$/) { + open($fh, "| gzip -c >$file") or die "can't open $file\n"; + } else { + open($fh, ">$file") or die "can't open $file\n"; + } + return $fh; +} + + +sub parse_list_file { + my $fh = shift @_; + my %list = (); + while (<$fh>) { + chomp; + $_ =~ s/\s+$//g; + $list{$_}++; + } + return %list; +} + + +sub parse_fasta_file { + my ($fh, $input_hashref, $input_arrayref) = @_; + my $seq_name = ""; + while (<$fh>) { + chomp; + if (/^\s*$/) { + next; + } elsif (/^\s*#/) { + next; + } elsif (/^>(.*)/) { + $seq_name = $1; + push @$input_arrayref, $seq_name; + $$input_hashref{$seq_name} = ""; + } else { + $$input_hashref{$seq_name} .= $_; + } + } +} + + diff --git a/scripts/fastq2id.pl b/scripts/fastq2id.pl new file mode 100755 index 0000000..e0f25c3 --- /dev/null +++ b/scripts/fastq2id.pl @@ -0,0 +1,61 @@ +#!/usr/bin/perl +use warnings; +use strict; +use Getopt::Long; + +############################################################## +# script: fastq2id.pl +# author: Jia-Xing Yue (GitHub ID: yjx1217) +# last edited: 2018.12.28 +# description: extract all read IDs from the input fastq file (uncompressed or gzip-compressed). +# example: perl fastq2id.pl -i input.fastq(.gz) -o output.id.list.txt +############################################################## + +my ($input, $output); + +GetOptions('input|i:s' => \$input, + 'output|o:s' => \$output); + + +my $input_fh = read_file($input); +my $count = 0; +my $output_fh = write_file($output); + +while (<$input_fh>) { + chomp; + if ($. % 4 == 1) { + my ($id) = ($_ =~ /^\@(\S+)/); + print $output_fh "$id\n"; + } elsif ($. % 4 == 2) { + # print $output_fh "$_\n"; + $count++; + } +} + +print "\nDone! 
Processed $count reads in total!\n\n"; + +sub read_file { + my $file = shift @_; + my $fh; + if ($file =~ /\.gz$/) { + open($fh, "gunzip -c $file |") or die "can't open pipe to $file"; + } + else { + open($fh, $file) or die "can't open $file"; + } + return $fh; +} + + +sub write_file { + my $file = shift @_; + my $fh; + if ($file =~ /\.gz$/) { + open($fh, "| gzip -c >$file") or die "can't open $file\n"; + } else { + open($fh, ">$file") or die "can't open $file\n"; + } + return $fh; +} + + diff --git a/scripts/guppy_reads_classifier.pl b/scripts/guppy_reads_classifier.pl new file mode 100755 index 0000000..a2da43f --- /dev/null +++ b/scripts/guppy_reads_classifier.pl @@ -0,0 +1,96 @@ +#!/usr/bin/perl +use warnings; +use strict; +use Getopt::Long; + +############################################################## +# script: guppy_reads_classifier.pl +# author: Jia-Xing Yue (GitHub ID: yjx1217) +# last edited: 2018.12.28 +# description: classify guppy basecalled fastq reads based on the guppy barcoding (demultiplexing) summary file +# example: perl guppy_reads_classifier.pl -fq basecalled_reads.fastq(.gz) -s barcoding_summary.txt +############################################################## + +my ($fq, $summary); +$fq = "./../Basecalling_Guppy_out/basecalled_reads.fastq.gz"; +$summary = "barcoding_summary.txt"; + +GetOptions('fastq|fq:s' => \$fq, + 'summary|s:s' => \$summary); + +my $summary_fh = read_file($summary); +my %summary = parse_summary_file($summary_fh); + +my $fq_fh = read_file($fq); + +my %barcodes = (); +my %output = (); +my $count = 0; +my $b; +my $file; +my $fh; +while (<$fq_fh>) { + chomp; + if ($. % 4 == 1) { + my ($id) = ($_ =~ /^\@(\S+)/); + # print "id = $id\n"; + $count++; + if (exists $summary{$id}) { + $b = $summary{$id}; + # print "barcode=$b\n"; + $file = "$b.basecalled_reads.fastq"; + open($fh, ">>$file") or die "can't open $file\n"; + } else { + die "no barcode assignment found for read: $id\n"; + } + } + print $fh "$_\n"; +} + +print "\nDone! Processed $count reads in total!\n\n"; + +sub read_file { + my $file = shift @_; + my $fh; + if ($file =~ /\.gz$/) { + open($fh, "gunzip -c $file |") or die "can't open pipe to $file"; + } + else { + open($fh, $file) or die "can't open $file"; + } + return $fh; +} + + +sub write_file { + my $file = shift @_; + my $fh; + if ($file =~ /\.gz$/) { + open($fh, "| gzip -c >$file") or die "can't open $file\n"; + } else { + open($fh, ">$file") or die "can't open $file\n"; + } + return $fh; +} + + +sub parse_summary_file { + my $fh = shift @_; + my %summary = (); + while (<$fh>) { + chomp; + /^\s*$/ and next; + /^#/ and next; + /^read_id\tbarcode_arrangement/ and next; + my @line = split /\t/, $_; + my $read_id = $line[0]; + my $barcode_arrangement = $line[1]; + $summary{$read_id} = $barcode_arrangement; + } + return %summary; +}
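Typical invocations of the three new helper scripts, following the usage examples in their headers (input file names are illustrative):

    # report the length of every sequence in a plain or gzipped fasta file, longest first
    perl $LRSDAY_HOME/scripts/fasta_length.pl -i input.fa.gz -o input.length.txt -sort_by_length yes
    # extract all read IDs from a plain or gzipped fastq file
    perl $LRSDAY_HOME/scripts/fastq2id.pl -i input.fastq.gz -o input.id.list.txt
    # split guppy-basecalled reads by barcode using guppy's barcoding summary;
    # one <barcode>.basecalled_reads.fastq file is written per barcode in the working directory
    perl $LRSDAY_HOME/scripts/guppy_reads_classifier.pl -fq basecalled_reads.fastq.gz -s barcoding_summary.txt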