-
Notifications
You must be signed in to change notification settings - Fork 10
Running a Tiny HG002 Example
Change into a directory with at least 200 GB of allocated disk space:
cd /data/$USER
Launch an interactive session on Biowulf and load requisite Biowulf modules:
sinteractive
module load cromwell/40 git python/3.6
Clone the github repo and create a work directory for running the wdl workflow:
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
mkdir -p ${VG_WDL_DIR} && cd ${VG_WDL_DIR}
git clone https://github.com/vgteam/vg_wdl.git
Create the small workflow inputs directory, download the workflow inputs, and set up the miniwdl virtual environment:
WORKFLOW_INPUT_DIR="/data/$USER/test_vg_wdl_run/tiny_workflow_inputs"
${VG_WDL_DIR}/vg_wdl/scripts/setup_vg_wdl.sh -g ${WORKFLOW_INPUT_DIR} -v ${VG_WDL_DIR} -t true
exit
Collect input read data
COHORT_NAME="HG002_tiny"
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
COHORT_WORKFLOW_DIR="/data/$USER/test_vg_wdl_run/${COHORT_NAME}_cohort_workdir"
${VG_WDL_DIR}/vg_wdl/scripts/setup_input_reads.sh -l "${COHORT_NAME}" -w ${COHORT_WORKFLOW_DIR} -t true
Change into the cohort work directory and set up the input variables.
Only MATERNAL_SAMPLE_NAME, PATERNAL_SAMPLE_NAME, and PROBAND_SAMPLE_NAME need to be changed from this template.
MATERNAL_SAMPLE_NAME="HG004"
PATERNAL_SAMPLE_NAME="HG003"
PROBAND_SAMPLE_NAME="HG002"
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
WORKFLOW_INPUT_DIR="/data/$USER/test_vg_wdl_run/tiny_workflow_inputs"
COHORT_WORKFLOW_DIR="/data/$USER/test_vg_wdl_run/HG002_tiny_cohort_workdir"
Set up the workflow bash script
${VG_WDL_DIR}/vg_wdl/scripts/setup_trio_mapping_script.part_1.sh -p ${PROBAND_SAMPLE_NAME} -m ${MATERNAL_SAMPLE_NAME} -f ${PATERNAL_SAMPLE_NAME} -w ${COHORT_WORKFLOW_DIR} -g ${WORKFLOW_INPUT_DIR} -v ${VG_WDL_DIR} -t true
Run the trio mapping workflow
cd ${COHORT_WORKFLOW_DIR}
sbatch --cpus-per-task=2 --mem=10g --gres=lscratch:50 --time=15:00:00 ${PROBAND_SAMPLE_NAME}_cohort_trio_map.part_1.sh
Set up the workflow bash script
MATERNAL_SAMPLE_NAME="HG004"
PATERNAL_SAMPLE_NAME="HG003"
PROBAND_SAMPLE_NAME="HG002"
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
WORKFLOW_INPUT_DIR="/data/$USER/test_vg_wdl_run/tiny_workflow_inputs"
COHORT_WORKFLOW_DIR="/data/$USER/test_vg_wdl_run/HG002_tiny_cohort_workdir"
${VG_WDL_DIR}/vg_wdl/scripts/setup_trio_calling_script.part_2.sh -p ${PROBAND_SAMPLE_NAME} -m ${MATERNAL_SAMPLE_NAME} -f ${PATERNAL_SAMPLE_NAME} -w ${COHORT_WORKFLOW_DIR} -g ${WORKFLOW_INPUT_DIR} -v ${VG_WDL_DIR} -t true
Run the trio genotyping workflow
cd ${COHORT_WORKFLOW_DIR}
sbatch --cpus-per-task=2 --mem=10g --gres=lscratch:50 --time=10:00:00 ${PROBAND_SAMPLE_NAME}_cohort_trio_call.part_2.sh
Set up the workflow bash script
MATERNAL_SAMPLE_NAME="HG004"
PATERNAL_SAMPLE_NAME="HG003"
PROBAND_SAMPLE_NAME="HG002"
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
WORKFLOW_INPUT_DIR="/data/$USER/test_vg_wdl_run/tiny_workflow_inputs"
COHORT_WORKFLOW_DIR="/data/$USER/test_vg_wdl_run/HG002_tiny_cohort_workdir"
PED_FILE="${WORKFLOW_INPUT_DIR}/HG002.ped"
${VG_WDL_DIR}/vg_wdl/scripts/setup_parent_graph_construct_script.part_3.sh -p ${PROBAND_SAMPLE_NAME} -m ${MATERNAL_SAMPLE_NAME} -f ${PATERNAL_SAMPLE_NAME} -c ${PED_FILE} -w ${COHORT_WORKFLOW_DIR} -g ${WORKFLOW_INPUT_DIR} -v ${VG_WDL_DIR} -t true
Run the parental graph construction workflow
cd ${COHORT_WORKFLOW_DIR}
sbatch --cpus-per-task=2 --mem=10g --gres=lscratch:50 --time=10:00:00 ${PROBAND_SAMPLE_NAME}_cohort_parental_graph_construction.part_3.sh
Change into the cohort work directory and set up the input variables.
The -s flag of the setup_sibling_mapping_script.part_4.sh script lists the sample IDs in the sibling list. The proband sample name should be listed first. Example: setup_sibling_mapping_script.part_4.sh -s 'HG002' -s 'SIBLING_1_ID' -s 'SIBLING_2_ID' ...
PROBAND_SAMPLE_NAME="HG002"
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
WORKFLOW_INPUT_DIR="/data/$USER/test_vg_wdl_run/tiny_workflow_inputs"
COHORT_WORKFLOW_DIR="/data/$USER/test_vg_wdl_run/HG002_tiny_cohort_workdir"
${VG_WDL_DIR}/vg_wdl/scripts/setup_sibling_mapping_script.part_4.sh -s ${PROBAND_SAMPLE_NAME} -w ${COHORT_WORKFLOW_DIR} -g ${WORKFLOW_INPUT_DIR} -v ${VG_WDL_DIR} -t true
Run the sibling alignment workflow
cd ${COHORT_WORKFLOW_DIR}
sbatch --cpus-per-task=2 --mem=10g --gres=lscratch:50 --time=10:00:00 ${PROBAND_SAMPLE_NAME}_cohort_2nd_iter_sibling_map.part_4.sh
Change into the cohort work directory and set up the input variables.
The -s flag of the setup_pedigree_calling_script.part_5.sh script lists the sample IDs in the sibling list. The proband sample name should be listed first. Example: setup_pedigree_calling_script.part_5.sh -s 'HG002' -s 'SIBLING_1_ID' -s 'SIBLING_2_ID' ...
MATERNAL_SAMPLE_NAME="HG004"
PATERNAL_SAMPLE_NAME="HG003"
PROBAND_SAMPLE_NAME="HG002"
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
WORKFLOW_INPUT_DIR="/data/$USER/test_vg_wdl_run/tiny_workflow_inputs"
COHORT_WORKFLOW_DIR="/data/$USER/test_vg_wdl_run/HG002_tiny_cohort_workdir"
${VG_WDL_DIR}/vg_wdl/scripts/setup_pedigree_calling_script.part_5.sh -s ${PROBAND_SAMPLE_NAME} -m ${MATERNAL_SAMPLE_NAME} -f ${PATERNAL_SAMPLE_NAME} -w ${COHORT_WORKFLOW_DIR} -g ${WORKFLOW_INPUT_DIR} -v ${VG_WDL_DIR} -t true
Run the cohort genotyping workflow
cd ${COHORT_WORKFLOW_DIR}
sbatch --cpus-per-task=2 --mem=10g --gres=lscratch:50 --time=10:00:00 ${PROBAND_SAMPLE_NAME}_cohort_2nd_iter_pedigree_call.part_5.sh
Change into the cohort work directory and set up the input variables.
The -s flag of the setup_pedigree_indel_realignment_script.part_6.sh script lists the sample IDs in the sibling list. The proband sample name should be listed first. Example: setup_pedigree_indel_realignment_script.part_6.sh -s 'HG002' -s 'SIBLING_1_ID' -s 'SIBLING_2_ID' ...
MATERNAL_SAMPLE_NAME="HG004"
PATERNAL_SAMPLE_NAME="HG003"
PROBAND_SAMPLE_NAME="HG002"
VG_WDL_DIR="/data/$USER/test_vg_wdl_run/wdl_tools"
WORKFLOW_INPUT_DIR="/data/$USER/test_vg_wdl_run/workflow_inputs"
COHORT_WORKFLOW_DIR="/data/$USER/test_vg_wdl_run/HG002_tiny_cohort_workdir"
${VG_WDL_DIR}/vg_wdl/scripts/setup_pedigree_indel_realignment_script.part_6.sh -s ${PROBAND_SAMPLE_NAME} -m ${MATERNAL_SAMPLE_NAME} -f ${PATERNAL_SAMPLE_NAME} -w ${COHORT_WORKFLOW_DIR} -g ${WORKFLOW_INPUT_DIR} -v ${VG_WDL_DIR}
Run the cohort indel realignment workflow
cd ${COHORT_WORKFLOW_DIR}
sbatch --cpus-per-task=2 --mem=10g --gres=lscratch:50 --time=10:00:00 ${PROBAND_SAMPLE_NAME}_cohort_2nd_iter_pedigree_indel_realign.part_6.sh