hap2_hap1_ID_mapping.sh

#!/bin/bash

# Tom Mathers.

# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.

# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.

# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2. 

# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs).

# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument.

# Run on farm with at least 16 cores and 16 Gb ram.


if [ $# -ne 2 ]; then
    echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta
    exit 1
fi


hap1=$1
hap2=$2

/rugpfs/fs0/vgl/store/vglshare/tools/VGP-tools/mashmap/mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16 -s 50000;

cut -d " " -f1 mashmap.out | grep -v SCAFFOLD | grep -v unloc | uniq > tmp; while read id; do awk -v val=${id} '$1==val' mashmap.out | awk '{print $0 "\t" $9 - $8}' | sort -nrk11,11 | head -n1 ; done < tmp | awk '{print $1 "\t" $6}' > hap2_hap1.tsv

rm tmp