Skip to content

Commit

Permalink
various feature polishing
Browse files Browse the repository at this point in the history
  • Loading branch information
yjx1217 committed Jun 27, 2021
1 parent b6ba00d commit 4fbf1e1
Show file tree
Hide file tree
Showing 21 changed files with 697 additions and 311 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ upper_quantile=85
min_mappability=0.85 # The minimal mappability for sliding-window-based CNV profiling. Default = "0.85".
excluded_chr_list_for_cnv_profiling="" # The relative path to the list for specifying chromosomes/scaffolds/contigs to be exclued for CNV profiling. We strongly recommend to exclude the organelle (e.g. Mitochondria and Choloraplast) genomes and plasmids if exists. Use "" if there is no chromosome/scaffold/contig for exclusion. Default = "".
raw_read_length=100 # RecombineX will fix this value for simplicity. There is no need to adjust it for the actual lengths of your Illumina reads.
check_duplicates_by_windowmasker="no" # Whether to mark duplicated regions using windowmasker. Default = "no".
ram_for_windowmasker="1536" # Accessible RAM (in MB) for windowmasker. Default = 1536.

test_file_existence () {
filename=$1
Expand Down Expand Up @@ -61,9 +63,15 @@ echo "relabel sequences in the genome assembly with the genome_tag prefix .."
cat ref.genome.raw.fa |sed "s/>/>ref_/g" > ref.genome.raw.relabel.fa
$samtools_dir/samtools faidx ref.genome.raw.relabel.fa

$windowmasker_dir/windowmasker -mk_counts -in ref.genome.raw.relabel.fa -out ref.genome.raw.relabel.masking_library.ustat
if [[ $check_duplicates_by_windowmasker == "yes" ]]
then
$windowmasker_dir/windowmasker -checkdup true -mk_counts -in ref.genome.raw.relabel.fa -out ref.genome.raw.relabel.masking_library.ustat -mem $ram_for_windowmasker
else
$windowmasker_dir/windowmasker -mk_counts -in ref.genome.raw.relabel.fa -out ref.genome.raw.relabel.masking_library.ustat -mem $ram_for_windowmasker
fi

$windowmasker_dir/windowmasker -ustat ref.genome.raw.relabel.masking_library.ustat -dust true \
-in ref.genome.raw.relabel.fa -out ref.genome.softmask.relabel.fa -outfmt fasta
-in ref.genome.raw.relabel.fa -out ref.genome.softmask.relabel.fa -outfmt fasta
perl $RECOMBINEX_HOME/scripts/softmask2hardmask.pl -i ref.genome.softmask.relabel.fa -o ref.genome.hardmask.relabel.fa
$samtools_dir/samtools faidx ref.genome.hardmask.relabel.fa
### slower solution ###
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ then
fi

# map reads to the reference genome
$bwa_dir/bwa mem -t $threads -M ref.genome.raw.fa $parent1_tag.R1.trimmed.PE.fq.gz $parent1_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff - >${parent1_tag}-ref.ref.bam
$bwa_dir/bwa mem -t $threads -M ref.genome.raw.fa $parent1_tag.R1.trimmed.PE.fq.gz $parent1_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff -F 3340 -f 2 - >${parent1_tag}-ref.ref.bam

if [[ $debug == "no" ]]
then
Expand Down Expand Up @@ -396,7 +396,7 @@ then
fi

# map reads to the reference genome
$bwa_dir/bwa mem -t $threads -M ref.genome.raw.fa $parent2_tag.R1.trimmed.PE.fq.gz $parent2_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff - >${parent2_tag}-ref.ref.bam
$bwa_dir/bwa mem -t $threads -M ref.genome.raw.fa $parent2_tag.R1.trimmed.PE.fq.gz $parent2_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff -F 3340 -f 2 - >${parent2_tag}-ref.ref.bam

if [[ $debug == "no" ]]
then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ source ./../../env.sh
# set project-specific variables
batch_id="Batch_S288C-SK1" # The batch id used for the gamete read mapping analysis. Default = "Batch_S288C-SK1"
master_sample_table="Master_Sample_Table.${batch_id}.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
net_quality_cutoff=30 # The net quality cutoff for genotyping. Default = "30".
net_quality_cutoff=20 # The net quality cutoff for genotyping. Default = "20".
apply_cnv_filter="yes" # Whether to set gamete genotype to NA for potential CNV regions in gametes. Set this option to "no" if the gamete sequencing depth is very low (e.g. <= 1). Default = "yes".
allow_heteroduplex="no" # Whether to consider the possibility of heteroduplex formation. Default = "no".
chr_list="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt" # The included chromosome list for the analyzed genome. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt".
color_scheme="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt" # The color scheme to use for plotting genotypes. This file is a tab-delimited two column list file in which the first column is parent_id and the second column is the hex color code. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt".
plot_centromere="yes" # Whether to plot centromere in the generated genotyping plots. Please note that enable this option requires that you have the ref.centromere.relabel.gff file ready in the "./../01.Reference_Genome_Preprocessing" directory. Default = "yes".
Expand All @@ -27,6 +29,7 @@ gamete_read_mapping_dir="./../03.Gamete_Read_Mapping_to_Reference_Genome" # The
output_dir="${batch_id}" # The output directory for this batch
marker_type="SNP" # The types of markers to use: "BOTH" or "SNP". Default = "SNP".
basecall_purity_cutoff=0.9 # The basecall purity cutoff for genotyping. Default = "0.9".

#############################################


Expand Down Expand Up @@ -54,6 +57,8 @@ perl $RECOMBINEX_HOME/scripts/batch_tetrad_genotyping_by_reference_genome.pl \
-s $master_sample_table \
-q $net_quality_cutoff \
-p $basecall_purity_cutoff \
-apply_cnv_filter $apply_cnv_filter \
-allow_heteroduplex $allow_heteroduplex \
-c $chr_list \
-b $batch_id \
-marker_dir $marker_dir \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ source ./../../env.sh
batch_id="Batch_S288C-SK1" # The batch id used for the gamete read mapping analysis. Default = "Batch_S288C-SK1".
master_sample_table="Master_Sample_Table.$batch_id.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
merging_range=5000 # The distance range (bp) for merging nearby COs. Default = "5000" (i.e. 5000 bp).
net_quality_cutoff=30 # The net quality difference cutoff used in tetrad genotyping. Default = "30".
net_quality_cutoff=20 # The net quality difference cutoff used in tetrad genotyping. Default = "20".
color_scheme="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt" # The color scheme to use for plotting genotypes. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt".
plot_individual_recombination_event="yes" # Whether to plot individual recombination event, "yes" by default. Default = "yes".
flanking="4000" # The recombination event flanking region (bp) for plotting. Default = "4000".
Expand All @@ -28,7 +28,7 @@ genome_dir="./../01.Reference_Genome_Preprocessing" # The relative path to the 0
genotype_dir="./../04.Tetrad_Genotyping_by_Reference_Genome" # The relative path to the 04.Tetrad_Genotyping_by_Reference_Genome directory.
output_dir=$batch_id # output directory to create within this current directory
min_marker_number=1 # The minimal number of markers to be considered for trustful linkage blocks. Default = "1".
min_block_size=5 # The minimal marker-bounded block size (bp) to be considered for trustful linkage blocks. Default = "5" (i.e. 5 bp).
min_block_size=1 # The minimal marker-bounded block size (bp) to be considered for trustful linkage blocks. Default = "1" (i.e. 1 bp).
###########################################


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ upper_quantile=85
min_mappability=0.85 # The minimal mappability for sliding-window-based CNV profiling. Default = "0.85".
excluded_chr_list_for_cnv_profiling="" # The relative path to the list for specifying chromosomes/scaffolds/contigs to be exclued for CNV profiling. We strongly recommend to exclude the organelle (e.g. Mitochondria and Choloraplast) genomes and plasmids if exists. Use "" if there is no chromosome/scaffold/contig for exclusion. Default = "".
raw_read_length=100 # RecombineX will fix this value for simplicity. There is no need to adjust it for the actual lengths of your Illumina reads.
check_duplicates_by_windowmasker="no" # Whether to mark duplicated regions using windowmasker. Default = "no".
ram_for_windowmasker="1536" # Accessible RAM (in MB) for windowmasker. Default = 1536.

test_file_existence () {
filename=$1
Expand Down Expand Up @@ -62,7 +64,13 @@ echo "relabel sequences in the genome assembly with the genome_tag prefix .."
cat $parent_tag.genome.raw.fa |sed "s/>/>${parent_tag}_/g" > $parent_tag.genome.raw.relabel.fa
$samtools_dir/samtools faidx $parent_tag.genome.raw.relabel.fa

$windowmasker_dir/windowmasker -mk_counts -in $parent_tag.genome.raw.relabel.fa -out $parent_tag.genome.raw.relabel.masking_library.ustat
if [[ $check_duplicates_by_windowmasker == "yes" ]]
then
$windowmasker_dir/windowmasker -checkdup true -mk_counts -in $parent_tag.genome.raw.relabel.fa -out $parent_tag.genome.raw.relabel.masking_library.ustat -mem $ram_for_windowmasker
else
$windowmasker_dir/windowmasker -mk_counts -in $parent_tag.genome.raw.relabel.fa -out $parent_tag.genome.raw.relabel.masking_library.ustat -mem $ram_for_windowmasker
fi

$windowmasker_dir/windowmasker -ustat $parent_tag.genome.raw.relabel.masking_library.ustat -dust true \
-in $parent_tag.genome.raw.relabel.fa -out $parent_tag.genome.softmask.relabel.fa -outfmt fasta
perl $RECOMBINEX_HOME/scripts/softmask2hardmask.pl -i $parent_tag.genome.softmask.relabel.fa -o $parent_tag.genome.hardmask.relabel.fa
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ then
fi

$bwa_dir/bwa index $parent1_tag.genome.raw.fa
$bwa_dir/bwa mem -t $threads -M $parent1_tag.genome.raw.fa $parent2_tag.R1.trimmed.PE.fq.gz $parent2_tag.R2.trimmed.PE.fq.gz |$samtools_dir/samtools view -bS -q $mapping_quality_cutoff - >$parent1_based_prefix.bam
$bwa_dir/bwa mem -t $threads -M $parent1_tag.genome.raw.fa $parent2_tag.R1.trimmed.PE.fq.gz $parent2_tag.R2.trimmed.PE.fq.gz |$samtools_dir/samtools view -bS -q $mapping_quality_cutoff -F 3340 -f 2 - >$parent1_based_prefix.bam

if [[ $debug == "no" ]]
then
Expand Down Expand Up @@ -431,7 +431,7 @@ then
fi

$bwa_dir/bwa index $parent2_tag.genome.raw.fa
$bwa_dir/bwa mem -t $threads -M $parent2_tag.genome.raw.fa $parent1_tag.R1.trimmed.PE.fq.gz $parent1_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff - >$parent2_based_prefix.bam
$bwa_dir/bwa mem -t $threads -M $parent2_tag.genome.raw.fa $parent1_tag.R1.trimmed.PE.fq.gz $parent1_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff -F 3340 -f 2 - >$parent2_based_prefix.bam

if [[ $debug == "no" ]]
then
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
!/bin/bash
#!/bin/bash
set -e -o pipefail

##########################################
Expand All @@ -10,7 +10,9 @@ source ./../../env.sh
batch_id="Batch_S288C-SK1" # The batch id used for gamete read mapping. Default = "Batch_S288C-SK1".
master_sample_table="Master_Sample_Table.${batch_id}.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
marker_dir="./../14.Polymorphic_Markers_by_Consensus" # The relative path to the "12.Polymorphic_Markers_by_Cross_Parent_Genome_Alignment" or "14.Polymorphic_Markers_by_Consensus" directory (for parental-genome-based analysis). Default = ./../14.Polymorphic_Markers_by_Consensus".
net_quality_cutoff=30 # The net quality cutoff for genotyping. Default = "30".
net_quality_cutoff=20 # The net quality cutoff for genotyping. Default = "20".
apply_cnv_filter="yes" # Whether to set gamete genotype to NA for potential CNV regions in gametes. Set this option to "no" if the gamete sequencing depth is very low (e.g. <= 1). Default = "yes".
allow_heteroduplex="no" # Whether to consider the possibility of heteroduplex formation. Default = "no".
chr_list="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt" # The chromosome list for the analyzed genome. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt".
color_scheme="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt" # The color scheme to use for plotting genotypes. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt".
plot_centromere="yes" # Whether to plot centromere in the generated genotyping plots. Please note that enable this option requires that you have the parent1_tag.centromere.relabel.gff and parent2_tag.centromere.relabel.gff files ready in the "./../11.Parent_Genome_Preprocessing" directory (for parental-genome-based analysis). Default = "yes".
Expand Down Expand Up @@ -56,6 +58,8 @@ perl $RECOMBINEX_HOME/scripts/batch_tetrad_genotyping_by_parent_genomes.pl \
-p $basecall_purity_cutoff \
-c $chr_list \
-b $batch_id \
-apply_cnv_filter $apply_cnv_filter \
-allow_heteroduplex $allow_heteroduplex \
-marker_dir $marker_dir \
-marker_type $marker_type \
-gamete_read_mapping_dir $gamete_read_mapping_dir \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ source ./../../env.sh
batch_id="Batch_S288C-SK1" # The batch id used for the gamete read mapping analysis. Default = "Batch_S288C-SK1".
master_sample_table="Master_Sample_Table.$batch_id.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
merging_range=5000 # The distance range (bp) for merging nearby COs. Default = "5000" (i.e. 5000 bp).
net_quality_cutoff=30 # The net quality difference cutoff used in tetrad genotyping. Default = "30".
net_quality_cutoff=20 # The net quality difference cutoff used in tetrad genotyping. Default = "20".
color_scheme="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt" # The color scheme to use for plotting genotypes. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt".
plot_individual_recombination_event="yes" # Whether to plot individual recombination event, "yes" by default. Default = "yes".
flanking="4000" # The recombination event flanking region (bp) for plotting. Default = "4000".
Expand All @@ -24,7 +24,7 @@ genome_dir="./../11.Parent_Genome_Preprocessing" # The relative path to the 11.P
genotype_dir="./../16.Tetrad_Genotyping_by_Parent_Genomes" # The relative path to the 16.Tetrad_Genotyping_by_Parent_Genomes directory.
output_dir=$batch_id # output directory to create within this current directory
min_marker_number=1 # The minimal number of markers to be considered for trustful linkage blocks. Default = "1".
min_block_size=5 # The minimal marker-bounded block size (bp) to be considered for trustful linkage blocks. Default = "5" (i.e. 5 bp).
min_block_size=1 # The minimal marker-bounded block size (bp) to be considered for trustful linkage blocks. Default = "1" (i.e. 1 bp).
###########################################

# filtering tetrads by spore mapping depth
Expand Down
2 changes: 2 additions & 0 deletions data/P1-P2.ref.final.SNP.markers.intermarker_distance.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
prefix marker_count mean_distance median_distance stdev_distance
P1-P2.ref.final.SNP.markers 49150 242.68 162.00 344.79
Binary file added data/P1-P2.ref.final.SNP.markers.pdf
Binary file not shown.
Binary file modified data/P1-P2.ref.final.SNP.markers.txt.gz
Binary file not shown.
1 change: 1 addition & 0 deletions data/Saccharomyces_cerevisiae.color_scheme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ DBVPG6044 #ff3300
DBVPG6765 #66cc33
Y12 #0099ff
YPS128 #ffcc00
heteroduplex #ffc425
1 change: 0 additions & 1 deletion install_dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,6 @@ if [ -z $(check_installed $freebayes_dir) ]; then
echo "Download freebayes-v${FREEBAYES_VERSION}"
download $FREEBAYES_SOURCE_DOWNLOAD_URL freebayes-${FREEBAYES_VERSION}-src.tar.gz
tar xzf freebayes-${FREEBAYES_VERSION}-src.tar.gz
rm freebayes-${FREEBAYES_VERSION}-src.tar.gz
cd freebayes
cd bin
download $FREEBAYES_DOWNLOAD_URL "freebayes-${FREEBAYES_VERSION}-linux-static-AMD64.gz"
Expand Down
4 changes: 2 additions & 2 deletions scripts/batch_read_mapping_to_parent_genomes.pl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
##############################################################
# script: batch_read_mapping_to_parent_genomes.pl
# author: Jia-Xing Yue (GitHub ID: yjx1217)
# last edited: 2019.09.10
# last edited: 2021.06.17
# description: run batch read mapping to parent genomes
# example: perl batch_read_mapping_to_parent_genomes.pl -i Master_Sample_Table.txt -t 4 -o output_dir -parent_genome_assembly_dir ./../01.Parent_Genome_Preprocessing -gamete_reads_dir ./../00.Gamete_Reads -min_mappability 0.85 -window_size 250 -ploidy 1
##############################################################
Expand Down Expand Up @@ -142,7 +142,7 @@
print("mapping the reads by bwa\n");
system("ln -s $parent_genome_assembly $parent_genome_tag.genome.fa");
system("$bwa_dir/bwa index $parent_genome_tag.genome.fa");
system("$bwa_dir/bwa mem -t $threads -M $parent_genome_tag.genome.fa $sample_tag.R1.trimmed.PE.fq.gz $sample_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff_for_mpileup - >$sample_tag.${parent_genome_tag}.bam");
system("$bwa_dir/bwa mem -t $threads -M $parent_genome_tag.genome.fa $sample_tag.R1.trimmed.PE.fq.gz $sample_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff_for_mpileup -F 3340 -f 2 - >$sample_tag.${parent_genome_tag}.bam");
if ($debug eq "no") {
system("rm $parent_genome_tag.genome.fa.bwt");
system("rm $parent_genome_tag.genome.fa.pac");
Expand Down
4 changes: 2 additions & 2 deletions scripts/batch_read_mapping_to_reference_genome.pl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
##############################################################
# script: batch_read_mapping_to_reference_genome.pl
# author: Jia-Xing Yue (GitHub ID: yjx1217)
# last edited: 2019.09.09
# last edited: 2021.06.17
# description: run batch read mapping to the reference genome
# example: perl batch_read_mapping_to_reference_genome.pl -i Master_Sample_Table.txt -t 4 -o output_dir -reference_genome_assembly_dir ./../01.Reference_Genome_Preprocessing -gamete_reads_dir ./../00.Gamete_Reads -min_mappability 0.85 -window_size 250 -ploidy 1
##############################################################
Expand Down Expand Up @@ -143,7 +143,7 @@
print("mapping the reads by bwa\n");
system("ln -s $reference_genome_assembly $reference_genome_tag.genome.fa");
system("$bwa_dir/bwa index $reference_genome_tag.genome.fa");
system("$bwa_dir/bwa mem -t $threads -M $reference_genome_tag.genome.fa $sample_tag.R1.trimmed.PE.fq.gz $sample_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff_for_mpileup - >${sample_tag}.${reference_genome_tag}.bam");
system("$bwa_dir/bwa mem -t $threads -M $reference_genome_tag.genome.fa $sample_tag.R1.trimmed.PE.fq.gz $sample_tag.R2.trimmed.PE.fq.gz | $samtools_dir/samtools view -bS -q $mapping_quality_cutoff_for_mpileup -F 3340 -f 2 - >${sample_tag}.${reference_genome_tag}.bam");
if ($debug eq "no") {
system("rm $reference_genome_tag.genome.fa.bwt");
system("rm $reference_genome_tag.genome.fa.pac");
Expand Down
Loading

0 comments on commit 4fbf1e1

Please sign in to comment.