forked from Nadolina/Rapid-curation-2.0
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhap2_hap1_ID_mapping.sh
31 lines (17 loc) · 1.06 KB
/
hap2_hap1_ID_mapping.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/bin/bash
# Tom Mathers.
# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.
# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.
# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2.
# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs).
# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument.
# Run on farm with at least 16 cores and 16 Gb ram.
if [ $# -ne 2 ]; then
echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta
exit 1
fi
hap1=$1
hap2=$2
/rugpfs/fs0/vgl/store/vglshare/tools/VGP-tools/mashmap/mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16 -s 50000;
cut -d " " -f1 mashmap.out | grep -v SCAFFOLD | grep -v unloc | uniq > tmp; while read id; do awk -v val=${id} '$1==val' mashmap.out | awk '{print $0 "\t" $9 - $8}' | sort -nrk11,11 | head -n1 ; done < tmp | awk '{print $1 "\t" $6}' > hap2_hap1.tsv
rm tmp