Skip to content

Commit

Permalink
release candidate 20211212
Browse files Browse the repository at this point in the history
  • Loading branch information
yjx1217 committed Dec 25, 2021
1 parent 4fbf1e1 commit e2dbe6c
Show file tree
Hide file tree
Showing 117 changed files with 3,563 additions and 62 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added Manual.docx
Binary file not shown.
Binary file removed Manual.pdf
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ source ./../../env.sh
# set project-specific variables
batch_id="Batch_S288C-SK1" # The batch id used for the gamete read mapping analysis. Default = "Batch_S288C-SK1"
master_sample_table="Master_Sample_Table.${batch_id}.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
net_quality_cutoff=20 # The net quality cutoff for genotyping. Default = "20".
net_quality_cutoff=50 # The net quality cutoff for genotyping. Default = "50".
apply_cnv_filter="yes" # Whether to set gamete genotype to NA for potential CNV regions in gametes. Set this option to "no" if the gamete sequencing depth is very low (e.g. <= 1). Default = "yes".
allow_heteroduplex="no" # Whether to consider the possibility of heteroduplex formation. Default = "no".
chr_list="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt" # The included chromosome list for the analyzed genome. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt".
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,25 @@ source ./../../env.sh
batch_id="Batch_S288C-SK1" # The batch id used for the gamete read mapping analysis. Default = "Batch_S288C-SK1".
master_sample_table="Master_Sample_Table.$batch_id.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
merging_range=5000 # The distance range (bp) for merging nearby COs. Default = "5000" (i.e. 5000 bp).
net_quality_cutoff=20 # The net quality difference cutoff used in tetrad genotyping. Default = "20".
net_quality_cutoff=50 # The net quality difference cutoff used in tetrad genotyping. Default = "50".
color_scheme="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt" # The color scheme to use for plotting genotypes. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt".
plot_individual_recombination_event="yes" # Whether to plot individual recombination event, "yes" by default. Default = "yes".
plot_individual_recombination_event="no" # Whether to plot individual recombination event: "yes" or "no". Default = "no".
flanking="4000" # The recombination event flanking region (bp) for plotting. Default = "4000".
debug="no" # Whether to keep intermediate files for debuging. Use "yes" if prefer to keep intermediate files, otherwise use "no". Default = "no".
###########################################





# process the pipeline
###########################################
# Normally no need to change the following parameters
genome_dir="./../01.Reference_Genome_Preprocessing" # The relative path to the 01.Reference_Genome_Preprocessing.
genotype_dir="./../04.Tetrad_Genotyping_by_Reference_Genome" # The relative path to the 04.Tetrad_Genotyping_by_Reference_Genome directory.
genotype_dir="./../04.Gamete_Genotyping_by_Reference_Genome" # The relative path to the 04.Gamete_Genotyping_by_Reference_Genome directory.
output_dir=$batch_id # output directory to create within this current directory
min_marker_number=1 # The minimal number of markers to be considered for trustful linkage blocks. Default = "1".
min_block_size=1 # The minimal marker-bounded block size (bp) to be considered for trustful linkage blocks. Default = "1" (i.e. 1 bp).
###########################################



# filtering tetrads by spore mapping depth
#perl $RECOMBINEX_HOME/scripts/filter_tetrads_by_sequencing_depth.pl -i $master_sample_table -o $master_sample_table.filtered \
# -d $spore_mapping_depth -c $mapping_depth_cutoff
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ source ./../../env.sh
batch_id="Batch_S288C-SK1" # The batch id used for gamete read mapping. Default = "Batch_S288C-SK1".
master_sample_table="Master_Sample_Table.${batch_id}.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
marker_dir="./../14.Polymorphic_Markers_by_Consensus" # The relative path to the "12.Polymorphic_Markers_by_Cross_Parent_Genome_Alignment" or "14.Polymorphic_Markers_by_Consensus" directory (for parental-genome-based analysis). Default = ./../14.Polymorphic_Markers_by_Consensus".
net_quality_cutoff=20 # The net quality cutoff for genotyping. Default = "20".
net_quality_cutoff=50 # The net quality cutoff for genotyping. Default = "50".
apply_cnv_filter="yes" # Whether to set gamete genotype to NA for potential CNV regions in gametes. Set this option to "no" if the gamete sequencing depth is very low (e.g. <= 1). Default = "yes".
allow_heteroduplex="no" # Whether to consider the possibility of heteroduplex formation. Default = "no".
chr_list="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt" # The chromosome list for the analyzed genome. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt".
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ source ./../../env.sh
batch_id="Batch_S288C-SK1" # The batch id used for the gamete read mapping analysis. Default = "Batch_S288C-SK1".
master_sample_table="Master_Sample_Table.$batch_id.txt" # The master sample table for this batch. Default = "Master_Sample_Table.${batch_id}.txt".
merging_range=5000 # The distance range (bp) for merging nearby COs. Default = "5000" (i.e. 5000 bp).
net_quality_cutoff=20 # The net quality difference cutoff used in tetrad genotyping. Default = "20".
net_quality_cutoff=50 # The net quality difference cutoff used in tetrad genotyping. Default = "50".
color_scheme="$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt" # The color scheme to use for plotting genotypes. Default = "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.color_scheme.txt".
plot_individual_recombination_event="yes" # Whether to plot individual recombination event, "yes" by default. Default = "yes".
plot_individual_recombination_event="no" # Whether to plot individual recombination event: "yes" or "no". Default = "no".
flanking="4000" # The recombination event flanking region (bp) for plotting. Default = "4000".
debug="no" # Whether to keep intermediate files for debuging. Use "yes" if prefer to keep intermediate files, otherwise use "no". Default = "no".
###########################################
Expand All @@ -21,7 +21,7 @@ debug="no" # Whether to keep intermediate files for debuging. Use "yes" if prefe
###########################################
# Normally no need to change the following parameters
genome_dir="./../11.Parent_Genome_Preprocessing" # The relative path to the 11.Parent_Genome_Preprocessing directory.
genotype_dir="./../16.Tetrad_Genotyping_by_Parent_Genomes" # The relative path to the 16.Tetrad_Genotyping_by_Parent_Genomes directory.
genotype_dir="./../16.Gamete_Genotyping_by_Parent_Genomes" # The relative path to the 16.Gamete_Genotyping_by_Parent_Genomes directory.
output_dir=$batch_id # output directory to create within this current directory
min_marker_number=1 # The minimal number of markers to be considered for trustful linkage blocks. Default = "1".
min_block_size=1 # The minimal marker-bounded block size (bp) to be considered for trustful linkage blocks. Default = "1" (i.e. 1 bp).
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
# RecombineX

<p align="center">
<img src="https://github.com/yjx1217/RecombineX/blob/master/RecombineX.logo.png" alt="RecombineX logo" width="547" height="162"/>
<img src="https://github.com/yjx1217/RecombineX/blob/master/RecombineX.logo.png" alt="RecombineX_logo" width="547" height="162"/>
</p>

**RecombineX: a computational framework for tetrad-based meiotic recombination analysis**
**RecombineX: a computational framework for high-throughput gamete genotyping and tetrad-based meiotic recombination analysis**

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

## Description
<div style="text-align: justify">
RecombineX is a computational framework for tetrad-based genotyping and meiotic recombination analysis. It handles the full workflow of marker identification, tetrad genotyping, as well as recombination events profiling and classification and produces publication-quality plots. In addition to the conventional reference-genome-based approach, RecombineX also supports the analysis based on the native parent genomes, therefore permitting the close examination on how native parent genomic backgrounds may affect meiotic recombination landscapes of the resulting tetrads. Moreover, RecombineX can also handle partially viable tetrads (e.g. the tetrad with only 3 viable gametes) with its genotype inference feature, which is very useful for studying genome incompatibility. Also, RecombineX shines in its high scalability, capable of processing thousands of sequenced tetrads. Finally, we also developed a tetrad simulation module for RecombineX, which provides rich parameters for users to simulate recombinant tetrads with all introduced recombination events recorded in detail, which can be very useful for downstream hypothesis testing and software development.
Meiotic recombination is an essential biological process that ensures faithful chromosome segregation and promotes parental allele reshuffling. Tetrad analysis is a powerful approach to quantify the genetic makeups and recombination landscapes of meiotic products. Here we present RecombineX, an integrated computational framework that automates the full workflow of marker identification, gamete genotyping, and tetrad-based recombination profiling in a high-throughput fashion, capable of processing hundreds of tetrads in a single batch. Aside from conventional reference-based analysis, RecombineX can also perform analysis based on parental genome assemblies, which enables analyzing meiotic recombination landscapes in their native genomic contexts. Additional features such as copy number variation profiling and missing genotype inference further enhance downstream analysis. RecombineX also includes a dedicated module for simulating the genomes and reads of recombinant tetrads for any given organism, which enables fine-tuned simulation-based hypothesis testing.
</div>

<p align="center">
<img src="https://github.com/yjx1217/RecombineX/blob/master/RecombineX.overview.png" alt="RecombineX logo" width="915" height="888"/>
<img src="https://github.com/yjx1217/RecombineX/blob/master/RecombineX.overview.png" alt="RecombineX_overview" width="915" height="888"/>
</p>

Under the hood, a series of task-specific modules are provided to carry out the full workflow of RecombineX:
Expand All @@ -33,8 +33,8 @@ Under the hood, a series of task-specific modules are provided to carry out the
* identifying polymorphic markers between the two crossing parents based on the reference genome (for the "reference-based" mode only)
* **03.Gamete_Read_Mapping_to_Reference_Genome**
* mapping the reads of labeled gametes to the reference genome (for the "reference-based" mode only)
* **04.Tetrad_Genotyping_by_Reference_Genome**
* assigning genotypes to labeled gametes from the same tetrad based on the reference genome (for the "reference-based" mode only)
* **04.Gamete_Genotyping_by_Reference_Genome**
* assigning genotypes to a list of pre-defined gametes based on the reference genome (for the "reference-based" mode only)
* **05.Recombination_Profiling_by_Reference_Genome**
* profiling and classifying recombination events for each tetrad based on the reference genome (for the "reference-based" mode only)
* **11.Parent_Genome_Preprocessing**
Expand All @@ -47,8 +47,8 @@ Under the hood, a series of task-specific modules are provided to carry out the
* identifying consensus polymorphic markers between the two crossing parents based on both whole genome alignment and cross-parent read mapping (for the "parent-based" mode only)
* **15.Gamete_Read_Mapping_to_Parent_Genomes**
* mapping the reads of labeled gametes to the genomes of two native parents (for the "parent-based" mode only)
* **16.Tetrad_Genotyping_by_Parent_Genomes**
* assigning genotypes to labeled gametes from the same tetrad based on parent genomes (for the "parent-based" mode only)
* **16.Gamete_Genotyping_by_Parent_Genomes**
* assigning genotypes to a list of pre-defined gametes from the same tetrad based on parent genomes (for the "parent-based" mode only)
* **17.Recombination_Profiling_by_Parent_Genomes**
* profiling and classifying recombination events for each tetrad based on parent genomes (for the "parent-based" mode only)
* **20.Recombinant_Tetrad_Simulation**
Expand Down
Binary file modified RecombineX.overview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/AND1702_12a.assembly.final.fa.gz
Binary file not shown.
Binary file added data/AND1702_12a.assembly.final.gff3.gz
Binary file not shown.
Binary file added data/AND1702_8a.assembly.final.fa.gz
Binary file not shown.
Binary file added data/AND1702_8a.assembly.final.gff3.gz
Binary file not shown.
5 changes: 2 additions & 3 deletions data/Chlamydomonas_reinhardtii.color_scheme.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
NA #e5e5e5
CC408 #ffff33
CC2935 #0099ff
CC2936 #4daf4a
CC1010 #ff3300
CC124 #4daf4a
heteroduplex #ffc425

21 changes: 11 additions & 10 deletions data/Saccharomyces_cerevisiae.color_scheme.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
NA #e5e5e5
P1 #ff3300
P2 #0099ff
S288C #ff3300
SK1 #0099ff
DBVPG6044 #ff3300
DBVPG6765 #66cc33
Y12 #0099ff
YPS128 #ffcc00
heteroduplex #ffc425
NA "#e5e5e5"
heteroduplex "#ffa500"
#P1 "#ff3300"
#P2 "#0099ff"
S288C "#ff3300"
SK1 "#0099ff"
#DBVPG6044 "#ff3300"
#DBVPG6765 "#66cc33"
#Y12 "#0099ff"
#YPS128 "#ffcc00"

5 changes: 3 additions & 2 deletions install_dependencies.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# last update: 2021/03/04
# last update: 2021/12/07

set -e -o pipefail

Expand Down Expand Up @@ -148,7 +148,7 @@ VCFLIB_DOWNLOAD_URL="https://github.com/vcflib/vcflib/releases/download/v${VCFLI

VT_VERSION="" #
VT_GITHUB_COMMIT_VERSION="f6d2b5d" # committed on 2018.08.01
VT_DOWNLOAD_URL="https://github.com/atks/vt"
VT_DOWNLOAD_URL="git://github.com/atks/vt"

FREEC_VERSION="11.4" # released on 2018.04.27
FREEC_DOWNLOAD_URL="https://github.com/BoevaLab/FREEC/archive/v${FREEC_VERSION}.tar.gz"
Expand Down Expand Up @@ -510,6 +510,7 @@ if [ -z $(check_installed $freebayes_dir) ]; then
chmod 755 freebayes-${FREEBAYES_VERSION}-linux-static-AMD64
ln -s freebayes-${FREEBAYES_VERSION}-linux-static-AMD64 freebayes
cd $build_dir
rm freebayes-${FREEBAYES_VERSION}-src.tar.gz
note_installed $freebayes_dir
fi

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash
# Downloads the sample reference genome from ref_genome_download_URL, runs
# tidy_fasta.pl on it, prefixes every FASTA header with "chr", and selects the
# sequences listed in chr_list via select_fasta_by_list.pl.
set -e -o pipefail

#######################################
# load environment variables for RecombineX
source ./../../env.sh

#######################################
# set project-specific variables
ref_genome_prefix="Chlamydomonas_reinhardtii" # The file name prefix of the reference genome. Default = "Chlamydomonas_reinhardtii".
ref_genome_download_URL="ftp://ftp.ensemblgenomes.org/pub/plants/release-49/fasta/chlamydomonas_reinhardtii/dna/Chlamydomonas_reinhardtii.Chlamydomonas_reinhardtii_v5.5.dna_sm.toplevel.fa.gz" # The URL for downloading the reference genome. Default = "ftp://ftp.ensemblgenomes.org/pub/plants/release-49/fasta/chlamydomonas_reinhardtii/dna/Chlamydomonas_reinhardtii.Chlamydomonas_reinhardtii_v5.5.dna_sm.toplevel.fa.gz".
chr_list="./../../data/Chlamydomonas_reinhardtii.chr_list.txt" # The single-column list defining chromosomes/scaffolds/contigs to be included. Default = "./../../data/Chlamydomonas_reinhardtii.chr_list.txt".
debug="no" # Whether to keep intermediate files for debugging. Use "yes" to keep intermediate files, otherwise use "no". Default = "no".
#######################################




#######################################
# process the pipeline

#######################################
# Download a reference genome into the current directory and leave it
# uncompressed as "$ref_genome_prefix.raw.fa".
# Globals:   ref_genome_prefix (read) - file name prefix of the downloaded file
# Arguments: $1 - URL to download; when it ends in ".gz" the download is
#                 saved as "$ref_genome_prefix.raw.fa.gz" and then gunzipped
# Outputs:   a "Downloading <url>" progress line on stdout
# Returns:   the non-zero status of wget or gunzip if either fails
#######################################
download_and_extract() {
  local url=$1
  local download_location

  echo "Downloading $url"
  # NOTE(review): --no-check-certificate turns off TLS certificate
  # verification for https URLs; kept as-is for backward compatibility.
  if [[ $url =~ \.gz$ ]]; then
    download_location="$ref_genome_prefix.raw.fa.gz"
    wget -c --no-check-certificate "$url" -O "$download_location" || return
    # -f: overwrite a stale "$ref_genome_prefix.raw.fa" left behind by an
    # earlier interrupted run; without it gunzip refuses and the re-run aborts.
    gunzip -f -- "$download_location" || return
  else
    download_location="$ref_genome_prefix.raw.fa"
    wget -c --no-check-certificate "$url" -O "$download_location" || return
  fi
}

# Step 1: fetch the genome; leaves "$ref_genome_prefix.raw.fa" in the current directory.
echo ""
echo "Retrieve the sample reference genome assembly ..."
download_and_extract "$ref_genome_download_URL"

# Step 2: tidy the FASTA, prefix every header with "chr", then keep only the
# sequences named in $chr_list. All expansions are quoted so that install
# paths or prefixes containing spaces do not get word-split.
echo ""
echo "Tidy the sample reference genome assembly ..."
"$RECOMBINEX_HOME/scripts/tidy_fasta.pl" -i "$ref_genome_prefix.raw.fa" -o "$ref_genome_prefix.tidy.fa"
sed -i "s/>/>chr/gi" "$ref_genome_prefix.tidy.fa"
# NOTE(review): the output is named *.fa.gz but no gzip step is run here;
# confirm that select_fasta_by_list.pl compresses based on the output file name.
"$RECOMBINEX_HOME/scripts/select_fasta_by_list.pl" -i "$ref_genome_prefix.tidy.fa" -l "$chr_list" -m normal -o "$ref_genome_prefix.tidy.lite.fa.gz"

if [[ "$debug" == "no" ]]; then
  echo ""
  echo "Removing intermediate files ..."
  rm -- "$ref_genome_prefix.raw.fa"
  rm -- "$ref_genome_prefix.tidy.fa"
fi

############################
# checking bash exit status
# With "set -e" active, execution only gets here when every command above
# succeeded, so this check is a final confirmation rather than error handling.
if [[ $? -eq 0 ]]; then
  echo ""
  echo "RecombineX message: This bash script has been successfully processed! :)"
  echo ""
  echo ""
  exit 0
fi
############################
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
# Retrieves the SGD S288C reference genome release R64-2-1, runs
# tidy_SGDref_genome.pl and select_fasta_by_list.pl on the genome sequence,
# gzips the result, and extracts the centromere features from the GFF.
set -e -o pipefail

#######################################
# load environment variables for RecombineX
source ./../../env.sh

#######################################
# set project-specific variables

debug="no" # Whether to keep intermediate files for debugging. Use "yes" to keep intermediate files, otherwise use "no". Default = "no".

#######################################



######################################
# process the pipeline

# Name shared by the release tarball and the directory it unpacks to.
sgd_release="S288C_reference_genome_R64-2-1_20150113"

echo "retrieve sample reference genome data ..."
wget -c "https://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/$sgd_release.tgz"
tar -xvzf "$sgd_release.tgz"
cp "./$sgd_release/S288C_reference_sequence_R64-2-1_20150113.fsa" SGDref.genome.raw.fa
cp "./$sgd_release/saccharomyces_cerevisiae_R64-2-1_20150113.gff" SGDref.all_feature.gff
# $RECOMBINEX_HOME is quoted throughout so an install path containing spaces
# is passed to perl as a single argument.
perl "$RECOMBINEX_HOME/scripts/tidy_SGDref_genome.pl" -i SGDref.genome.raw.fa -o SGDref.genome.tidy.fa
perl "$RECOMBINEX_HOME/scripts/select_fasta_by_list.pl" -i SGDref.genome.tidy.fa -l "$RECOMBINEX_HOME/data/Saccharomyces_cerevisiae.chr_list.txt" -o SGDref.genome.fa -m normal
# -f: replace an SGDref.genome.fa.gz left by a previous run; without it gzip
# refuses to overwrite and "set -e" aborts every re-run at this step.
gzip -f SGDref.genome.fa
perl "$RECOMBINEX_HOME/scripts/filter_gff_by_feature.pl" -i SGDref.all_feature.gff -o SGDref.centromere.gff -f centromere -m keep

# echo "retrieve sample subtelomere GFF files ..."
# cp $RECOMBINEX_HOME/data/Saccharomyces_cerevisiae_subtelomere_gff3/SGDref.subtelomere.gff .

if [[ "$debug" == "no" ]]; then
  echo ""
  echo "removing intermediate files and directories ..."

  # The glob removes both the unpacked directory and the downloaded .tgz.
  rm -rf -- "$sgd_release"*
  rm SGDref.genome.raw.fa
  rm SGDref.genome.tidy.fa
  rm SGDref.all_feature.gff
fi

############################
# checking bash exit status
# With "set -e" active, execution only gets here when every command above
# succeeded, so this check is a final confirmation rather than error handling.
if [[ $? -eq 0 ]]; then
  echo ""
  echo "RecombineX message: This bash script has been successfully processed! :)"
  echo ""
  echo ""
  exit 0
fi
############################
48 changes: 48 additions & 0 deletions pipelines/RecombineX.00.Prepare_Sample_Parent_Genomes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash
# Copies the sample parental genomes (S288C and SK1) and their feature GFFs
# from $RECOMBINEX_HOME/data into the current directory, extracts the
# centromere features with filter_gff_by_feature.pl, and then removes the
# copied all-feature GFFs.
set -e -o pipefail

#######################################
# load environment variables for RecombineX
source ./../../env.sh

#######################################
# set project-specific variables

# none

#######################################
# process the pipeline

# Sample parents handled by this script; shared by the copy and clean-up loops.
parents=(S288C SK1)

echo "retrieve sample parental genome data ..."
for i in "${parents[@]}"; do
  # Expansions are quoted so an install path containing spaces is not word-split.
  cp "$RECOMBINEX_HOME/data/$i.genome.fa" .
  cp "$RECOMBINEX_HOME/data/$i.all_feature.gff" .
  perl "$RECOMBINEX_HOME/scripts/filter_gff_by_feature.pl" -i "$i.all_feature.gff" -o "$i.centromere.gff" -f centromere -m keep
done

# echo "retrieve sample subtelomere GFF files ..."
# for i in S288C SK1
# do
#     cp $RECOMBINEX_HOME/data/Saccharomyces_cerevisiae_subtelomere_gff3/$i.subtelomere.gff .
# done

echo ""
echo "removing intermediate files and directories ..."
for i in "${parents[@]}"; do
  rm -- "$i.all_feature.gff"
done


############################
# checking bash exit status
# With "set -e" active, execution only gets here when every command above
# succeeded, so this check is a final confirmation rather than error handling.
if [[ $? -eq 0 ]]; then
  echo ""
  echo "RecombineX message: This bash script has been successfully processed! :)"
  echo ""
  echo ""
  exit 0
fi
############################
Loading

0 comments on commit e2dbe6c

Please sign in to comment.